crimson/os/seastore: implement generational GC

Place extents into the dedicated RecordSubmitter by their data-category
and reclaimed-count. Segments of different data-category or
reclaimed-count should have different locality in the access patterns,
which is the foundation to form a desired bimodal distribution of
segment utilizations, so that GC can be more efficient.

Signed-off-by: Yingxin Cheng <yingxin.cheng@intel.com>
This commit is contained in:
Yingxin Cheng 2022-05-27 17:13:06 +08:00
parent 12aade9b2c
commit 6b662cbd21
20 changed files with 390 additions and 167 deletions

View File

@ -13,13 +13,18 @@ SET_SUBSYS(seastore_cleaner);
namespace crimson::os::seastore {
// Mark this segment record as OPEN and remember its sequence, type,
// data category and reclaim generation; written_to restarts at 0.
// Every argument must be valid: a non-null seq and type, a real category
// (not data_category_t::NUM) and a generation below RECLAIM_GENERATIONS.
void segment_info_t::set_open(
segment_seq_t _seq, segment_type_t _type)
segment_seq_t _seq, segment_type_t _type,
data_category_t _category, reclaim_gen_t _generation)
{
ceph_assert(_seq != NULL_SEG_SEQ);
ceph_assert(_type != segment_type_t::NULL_SEG);
ceph_assert(_category != data_category_t::NUM);
ceph_assert(_generation < RECLAIM_GENERATIONS);
state = Segment::segment_state_t::OPEN;
seq = _seq;
type = _type;
category = _category;
generation = _generation;
written_to = 0;
}
@ -28,6 +33,8 @@ void segment_info_t::set_empty()
state = Segment::segment_state_t::EMPTY;
seq = NULL_SEG_SEQ;
type = segment_type_t::NULL_SEG;
category = data_category_t::NUM;
generation = NULL_GENERATION;
last_modified = {};
last_rewritten = {};
written_to = 0;
@ -40,13 +47,19 @@ void segment_info_t::set_closed()
}
// Initialize this segment record directly in the CLOSED state with the
// given sequence, type, data category and reclaim generation.
// written_to is set to seg_size. The same validity checks as set_open()
// apply to seq, type, category and generation.
void segment_info_t::init_closed(
segment_seq_t _seq, segment_type_t _type, std::size_t seg_size)
segment_seq_t _seq, segment_type_t _type,
data_category_t _category, reclaim_gen_t _generation,
std::size_t seg_size)
{
ceph_assert(_seq != NULL_SEG_SEQ);
ceph_assert(_type != segment_type_t::NULL_SEG);
ceph_assert(_category != data_category_t::NUM);
ceph_assert(_generation < RECLAIM_GENERATIONS);
state = Segment::segment_state_t::CLOSED;
seq = _seq;
type = _type;
category = _category;
generation = _generation;
written_to = seg_size;
}
@ -59,6 +72,8 @@ std::ostream& operator<<(std::ostream &out, const segment_info_t &info)
} else { // open or closed
out << ", seq=" << segment_seq_printer_t{info.seq}
<< ", type=" << info.type
<< ", category=" << info.category
<< ", generation=" << reclaim_gen_printer_t{info.generation}
<< ", last_modified=" << info.last_modified.time_since_epoch()
<< ", last_rewritten=" << info.last_rewritten.time_since_epoch()
<< ", written_to=" << info.written_to;
@ -124,15 +139,19 @@ void segments_info_t::add_segment_manager(
}
void segments_info_t::init_closed(
segment_id_t segment, segment_seq_t seq, segment_type_t type)
segment_id_t segment, segment_seq_t seq, segment_type_t type,
data_category_t category, reclaim_gen_t generation)
{
LOG_PREFIX(segments_info_t::init_closed);
auto& segment_info = segments[segment];
INFO("initiating {} {} {}, {}, num_segments(empty={}, opened={}, closed={})",
INFO("initiating {} {} {} {} {}, {}, "
"num_segments(empty={}, opened={}, closed={})",
segment, segment_seq_printer_t{seq}, type,
category, reclaim_gen_printer_t{generation},
segment_info, num_empty, num_open, num_closed);
ceph_assert(segment_info.is_empty());
segment_info.init_closed(seq, type, get_segment_size());
segment_info.init_closed(
seq, type, category, generation, get_segment_size());
ceph_assert(num_empty > 0);
--num_empty;
++num_closed;
@ -147,15 +166,18 @@ void segments_info_t::init_closed(
}
void segments_info_t::mark_open(
segment_id_t segment, segment_seq_t seq, segment_type_t type)
segment_id_t segment, segment_seq_t seq, segment_type_t type,
data_category_t category, reclaim_gen_t generation)
{
LOG_PREFIX(segments_info_t::mark_open);
auto& segment_info = segments[segment];
INFO("opening {} {} {}, {}, num_segments(empty={}, opened={}, closed={})",
INFO("opening {} {} {} {} {}, {}, "
"num_segments(empty={}, opened={}, closed={})",
segment, segment_seq_printer_t{seq}, type,
category, reclaim_gen_printer_t{generation},
segment_info, num_empty, num_open, num_closed);
ceph_assert(segment_info.is_empty());
segment_info.set_open(seq, type);
segment_info.set_open(seq, type, category, generation);
ceph_assert(num_empty > 0);
--num_empty;
++num_open;
@ -531,7 +553,9 @@ void AsyncCleaner::register_metrics()
segment_id_t AsyncCleaner::allocate_segment(
segment_seq_t seq,
segment_type_t type)
segment_type_t type,
data_category_t category,
reclaim_gen_t generation)
{
LOG_PREFIX(AsyncCleaner::allocate_segment);
assert(seq != NULL_SEG_SEQ);
@ -542,7 +566,7 @@ segment_id_t AsyncCleaner::allocate_segment(
auto& segment_info = it->second;
if (segment_info.is_empty()) {
auto old_usage = calc_utilization(seg_id);
segments.mark_open(seg_id, seq, type);
segments.mark_open(seg_id, seq, type, category, generation);
auto new_usage = calc_utilization(seg_id);
adjust_segment_util(old_usage, new_usage);
INFO("opened, should_block_on_gc {}, projected_avail_ratio {}, "
@ -682,7 +706,7 @@ AsyncCleaner::rewrite_dirty_ret AsyncCleaner::rewrite_dirty(
dirty_list,
[this, FNAME, &t](auto &e) {
DEBUGT("cleaning {}", t, *e);
return ecb->rewrite_extent(t, e);
return ecb->rewrite_extent(t, e, DIRTY_GENERATION);
});
});
});
@ -867,11 +891,12 @@ AsyncCleaner::gc_reclaim_space_ret AsyncCleaner::gc_reclaim_space()
INFO("reclaim {} {} start", seg_id, segment_info);
ceph_assert(segment_info.is_closed());
reclaim_state = reclaim_state_t::create(
seg_id, segments.get_segment_size());
seg_id, segment_info.generation, segments.get_segment_size());
}
reclaim_state->advance(config.reclaim_bytes_per_cycle);
DEBUG("reclaiming {}~{}",
DEBUG("reclaiming {} {}~{}",
reclaim_gen_printer_t{reclaim_state->generation},
reclaim_state->start_pos,
reclaim_state->end_pos);
double pavail_ratio = get_projected_available_ratio();
@ -965,7 +990,7 @@ AsyncCleaner::gc_reclaim_space_ret AsyncCleaner::gc_reclaim_space()
extents,
[this, &t, &reclaimed](auto &ext) {
reclaimed += ext->get_length();
return ecb->rewrite_extent(t, ext);
return ecb->rewrite_extent(t, ext, reclaim_state->target_generation);
});
});
}).si_then([this, &t, &seq] {
@ -1074,7 +1099,9 @@ AsyncCleaner::mount_ret AsyncCleaner::mount()
init_mark_segment_closed(
segment_id,
header.segment_seq,
header.type);
header.type,
header.category,
header.generation);
return seastar::now();
}).handle_error(
crimson::ct_error::enodata::handle(
@ -1179,7 +1206,9 @@ AsyncCleaner::scan_extents_ret AsyncCleaner::scan_nonfull_segment(
init_mark_segment_closed(
segment_id,
header.segment_seq,
header.type);
header.type,
header.category,
header.generation);
return seastar::now();
});
}

View File

@ -37,6 +37,10 @@ struct segment_info_t {
segment_type_t type = segment_type_t::NULL_SEG;
data_category_t category = data_category_t::NUM;
reclaim_gen_t generation = NULL_GENERATION;
time_point last_modified;
time_point last_rewritten;
@ -59,9 +63,12 @@ struct segment_info_t {
return state == Segment::segment_state_t::OPEN;
}
void init_closed(segment_seq_t, segment_type_t, std::size_t);
void init_closed(segment_seq_t, segment_type_t,
data_category_t, reclaim_gen_t,
std::size_t);
void set_open(segment_seq_t, segment_type_t);
void set_open(segment_seq_t, segment_type_t,
data_category_t, reclaim_gen_t);
void set_empty();
@ -190,9 +197,11 @@ public:
void add_segment_manager(SegmentManager &segment_manager);
// initiate non-empty segments, the others are by default empty
void init_closed(segment_id_t, segment_seq_t, segment_type_t);
void init_closed(segment_id_t, segment_seq_t, segment_type_t,
data_category_t, reclaim_gen_t);
void mark_open(segment_id_t, segment_seq_t, segment_type_t);
void mark_open(segment_id_t, segment_seq_t, segment_type_t,
data_category_t, reclaim_gen_t);
void mark_empty(segment_id_t);
@ -241,7 +250,7 @@ public:
virtual const segment_info_t& get_seg_info(segment_id_t id) const = 0;
virtual segment_id_t allocate_segment(
segment_seq_t seq, segment_type_t type) = 0;
segment_seq_t, segment_type_t, data_category_t, reclaim_gen_t) = 0;
virtual journal_seq_t get_dirty_extents_replay_from() const = 0;
@ -597,7 +606,8 @@ public:
using rewrite_extent_ret = rewrite_extent_iertr::future<>;
virtual rewrite_extent_ret rewrite_extent(
Transaction &t,
CachedExtentRef extent) = 0;
CachedExtentRef extent,
reclaim_gen_t target_generation) = 0;
/**
* get_extent_if_live
@ -739,7 +749,7 @@ public:
}
segment_id_t allocate_segment(
segment_seq_t seq, segment_type_t type) final;
segment_seq_t, segment_type_t, data_category_t, reclaim_gen_t) final;
void close_segment(segment_id_t segment) final;
@ -935,14 +945,21 @@ private:
}
struct reclaim_state_t {
reclaim_gen_t generation;
reclaim_gen_t target_generation;
std::size_t segment_size;
paddr_t start_pos;
paddr_t end_pos;
// Build the initial reclaim state for the segment being reclaimed.
// target_generation is generation + 1, except that the last generation
// (RECLAIM_GENERATIONS - 1) maps to itself. start_pos starts as
// P_ADDR_NULL and end_pos at offset 0 of the segment.
static reclaim_state_t create(
segment_id_t segment_id,
reclaim_gen_t generation,
std::size_t segment_size) {
return {segment_size,
ceph_assert(generation < RECLAIM_GENERATIONS);
return {generation,
// generation + 1 promotes to int, hence the cast back to reclaim_gen_t
(reclaim_gen_t)(generation == RECLAIM_GENERATIONS - 1 ?
generation : generation + 1),
segment_size,
P_ADDR_NULL,
paddr_t::make_seg_paddr(segment_id, 0)};
}
@ -1280,10 +1297,12 @@ private:
void init_mark_segment_closed(
segment_id_t segment,
segment_seq_t seq,
segment_type_t s_type) {
segment_type_t s_type,
data_category_t category,
reclaim_gen_t generation) {
ceph_assert(!init_complete);
auto old_usage = calc_utilization(segment);
segments.init_closed(segment, seq, s_type);
segments.init_closed(segment, seq, s_type, category, generation);
auto new_usage = calc_utilization(segment);
adjust_segment_util(old_usage, new_usage);
if (s_type == segment_type_t::OOL) {

View File

@ -313,7 +313,9 @@ public:
static mkfs_ret mkfs(op_context_t<node_key_t> c) {
auto root_leaf = c.cache.template alloc_new_extent<leaf_node_t>(
c.trans,
node_size);
node_size,
placement_hint_t::HOT,
0);
root_leaf->set_size(0);
fixed_kv_node_meta_t<node_key_t> meta{min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, 1};
root_leaf->set_meta(meta);
@ -814,7 +816,9 @@ public:
std::remove_reference_t<decltype(fixed_kv_extent)>
>(
c.trans,
fixed_kv_extent.get_length());
fixed_kv_extent.get_length(),
fixed_kv_extent.get_user_hint(),
fixed_kv_extent.get_reclaim_generation());
fixed_kv_extent.get_bptr().copy_out(
0,
fixed_kv_extent.get_length(),
@ -1400,7 +1404,7 @@ private:
if (split_from == iter.get_depth()) {
auto nroot = c.cache.template alloc_new_extent<internal_node_t>(
c.trans, node_size);
c.trans, node_size, placement_hint_t::HOT, 0);
fixed_kv_node_meta_t<node_key_t> meta{
min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, iter.get_depth() + 1};
nroot->set_meta(meta);

View File

@ -154,9 +154,9 @@ struct FixedKVInternalNode
std::tuple<Ref, Ref, NODE_KEY>
make_split_children(op_context_t<NODE_KEY> c) {
auto left = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
c.trans, node_size, placement_hint_t::HOT, 0);
auto right = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
c.trans, node_size, placement_hint_t::HOT, 0);
auto pivot = this->split_into(*left, *right);
left->pin.set_range(left->get_meta());
right->pin.set_range(right->get_meta());
@ -170,7 +170,7 @@ struct FixedKVInternalNode
op_context_t<NODE_KEY> c,
Ref &right) {
auto replacement = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
c.trans, node_size, placement_hint_t::HOT, 0);
replacement->merge_from(*this, *right->template cast<node_type_t>());
replacement->pin.set_range(replacement->get_meta());
return replacement;
@ -184,9 +184,9 @@ struct FixedKVInternalNode
ceph_assert(_right->get_type() == this->get_type());
auto &right = *_right->template cast<node_type_t>();
auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
c.trans, node_size, placement_hint_t::HOT, 0);
auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
c.trans, node_size, placement_hint_t::HOT, 0);
auto pivot = this->balance_into_new_nodes(
*this,
@ -355,9 +355,9 @@ struct FixedKVLeafNode
std::tuple<Ref, Ref, NODE_KEY>
make_split_children(op_context_t<NODE_KEY> c) {
auto left = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
c.trans, node_size, placement_hint_t::HOT, 0);
auto right = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
c.trans, node_size, placement_hint_t::HOT, 0);
auto pivot = this->split_into(*left, *right);
left->pin.set_range(left->get_meta());
right->pin.set_range(right->get_meta());
@ -371,7 +371,7 @@ struct FixedKVLeafNode
op_context_t<NODE_KEY> c,
Ref &right) {
auto replacement = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
c.trans, node_size, placement_hint_t::HOT, 0);
replacement->merge_from(*this, *right->template cast<node_type_t>());
replacement->pin.set_range(replacement->get_meta());
return replacement;
@ -385,9 +385,9 @@ struct FixedKVLeafNode
ceph_assert(_right->get_type() == this->get_type());
auto &right = *_right->template cast<node_type_t>();
auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
c.trans, node_size, placement_hint_t::HOT, 0);
auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
c.trans, node_size, placement_hint_t::HOT, 0);
auto pivot = this->balance_into_new_nodes(
*this,

View File

@ -85,8 +85,10 @@ Cache::retire_extent_ret Cache::retire_extent_addr(
// add a new placeholder to Cache
ext = CachedExtent::make_cached_extent_ref<
RetiredExtentPlaceholder>(length);
ext->set_paddr(addr);
ext->state = CachedExtent::extent_state_t::CLEAN;
ext->init(CachedExtent::extent_state_t::CLEAN,
addr,
placement_hint_t::NUM_HINTS,
NULL_GENERATION);
DEBUGT("retire {}~{} as placeholder, add extent -- {}",
t, addr, length, *ext);
add_extent(ext);
@ -924,40 +926,41 @@ void Cache::on_transaction_destruct(Transaction& t)
}
CachedExtentRef Cache::alloc_new_extent_by_type(
Transaction &t, ///< [in, out] current transaction
extent_types_t type, ///< [in] type tag
Transaction &t, ///< [in, out] current transaction
extent_types_t type, ///< [in] type tag
seastore_off_t length, ///< [in] length
placement_hint_t hint
placement_hint_t hint, ///< [in] user hint
reclaim_gen_t gen ///< [in] reclaim generation
)
{
LOG_PREFIX(Cache::alloc_new_extent_by_type);
SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}",
t, type, length, hint);
SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}, gen={}",
t, type, length, hint, reclaim_gen_printer_t{gen});
switch (type) {
case extent_types_t::ROOT:
ceph_assert(0 == "ROOT is never directly alloc'd");
return CachedExtentRef();
case extent_types_t::LADDR_INTERNAL:
return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length, hint);
return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length, hint, gen);
case extent_types_t::LADDR_LEAF:
return alloc_new_extent<lba_manager::btree::LBALeafNode>(t, length, hint);
return alloc_new_extent<lba_manager::btree::LBALeafNode>(t, length, hint, gen);
case extent_types_t::ONODE_BLOCK_STAGED:
return alloc_new_extent<onode::SeastoreNodeExtent>(t, length, hint);
return alloc_new_extent<onode::SeastoreNodeExtent>(t, length, hint, gen);
case extent_types_t::OMAP_INNER:
return alloc_new_extent<omap_manager::OMapInnerNode>(t, length, hint);
return alloc_new_extent<omap_manager::OMapInnerNode>(t, length, hint, gen);
case extent_types_t::OMAP_LEAF:
return alloc_new_extent<omap_manager::OMapLeafNode>(t, length, hint);
return alloc_new_extent<omap_manager::OMapLeafNode>(t, length, hint, gen);
case extent_types_t::COLL_BLOCK:
return alloc_new_extent<collection_manager::CollectionNode>(t, length, hint);
return alloc_new_extent<collection_manager::CollectionNode>(t, length, hint, gen);
case extent_types_t::OBJECT_DATA_BLOCK:
return alloc_new_extent<ObjectDataBlock>(t, length, hint);
return alloc_new_extent<ObjectDataBlock>(t, length, hint, gen);
case extent_types_t::RETIRED_PLACEHOLDER:
ceph_assert(0 == "impossible");
return CachedExtentRef();
case extent_types_t::TEST_BLOCK:
return alloc_new_extent<TestBlock>(t, length, hint);
return alloc_new_extent<TestBlock>(t, length, hint, gen);
case extent_types_t::TEST_BLOCK_PHYSICAL:
return alloc_new_extent<TestBlockPhysical>(t, length, hint);
return alloc_new_extent<TestBlockPhysical>(t, length, hint, gen);
case extent_types_t::NONE: {
ceph_assert(0 == "NONE is an invalid extent type");
return CachedExtentRef();
@ -986,6 +989,7 @@ CachedExtentRef Cache::duplicate_for_write(
ret->version++;
ret->state = CachedExtent::extent_state_t::MUTATION_PENDING;
ret->set_reclaim_generation(DIRTY_GENERATION);
DEBUGT("{} -> {}", t, *i, *ret);
return ret;
}

View File

@ -303,8 +303,10 @@ public:
if (!cached) {
auto ret = CachedExtent::make_cached_extent_ref<T>(
alloc_cache_buf(length));
ret->set_paddr(offset);
ret->state = CachedExtent::extent_state_t::CLEAN_PENDING;
ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
offset,
placement_hint_t::NUM_HINTS,
NULL_GENERATION);
SUBDEBUG(seastore_cache,
"{} {}~{} is absent, add extent and reading ... -- {}",
T::TYPE, offset, length, *ret);
@ -319,8 +321,10 @@ public:
if (cached->get_type() == extent_types_t::RETIRED_PLACEHOLDER) {
auto ret = CachedExtent::make_cached_extent_ref<T>(
alloc_cache_buf(length));
ret->set_paddr(offset);
ret->state = CachedExtent::extent_state_t::CLEAN_PENDING;
ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
offset,
placement_hint_t::NUM_HINTS,
NULL_GENERATION);
SUBDEBUG(seastore_cache,
"{} {}~{} is absent(placeholder), reading ... -- {}",
T::TYPE, offset, length, *ret);
@ -681,19 +685,23 @@ public:
TCachedExtentRef<T> alloc_new_extent(
Transaction &t, ///< [in, out] current transaction
seastore_off_t length, ///< [in] length
placement_hint_t hint = placement_hint_t::HOT
placement_hint_t hint, ///< [in] user hint
reclaim_gen_t gen ///< [in] reclaim generation
) {
LOG_PREFIX(Cache::alloc_new_extent);
SUBTRACET(seastore_cache, "allocate {} {}B, hint={}",
t, T::TYPE, length, hint);
auto result = epm.alloc_new_extent(t, T::TYPE, length, hint);
SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}",
t, T::TYPE, length, hint, reclaim_gen_printer_t{gen});
auto result = epm.alloc_new_extent(t, T::TYPE, length, hint, gen);
auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result.bp));
ret->set_paddr(result.paddr);
ret->hint = hint;
ret->state = CachedExtent::extent_state_t::INITIAL_WRITE_PENDING;
ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING,
result.paddr,
hint,
result.gen);
t.add_fresh_extent(ret);
SUBDEBUGT(seastore_cache, "allocated {} {}B extent at {}, hint={} -- {}",
t, T::TYPE, length, result.paddr, hint, *ret);
SUBDEBUGT(seastore_cache,
"allocated {} {}B extent at {}, hint={}, gen={} -- {}",
t, T::TYPE, length, result.paddr,
hint, reclaim_gen_printer_t{result.gen}, *ret);
return ret;
}
@ -703,10 +711,11 @@ public:
* Allocates a fresh extent. addr will be relative until commit.
*/
CachedExtentRef alloc_new_extent_by_type(
Transaction &t, ///< [in, out] current transaction
extent_types_t type, ///< [in] type tag
Transaction &t, ///< [in, out] current transaction
extent_types_t type, ///< [in] type tag
seastore_off_t length, ///< [in] length
placement_hint_t hint = placement_hint_t::HOT
placement_hint_t hint, ///< [in] user hint
reclaim_gen_t gen ///< [in] reclaim generation
);
/**

View File

@ -105,7 +105,17 @@ class CachedExtent : public boost::intrusive_ref_counter<
// time of the last rewrite
seastar::lowres_system_clock::time_point last_rewritten;
public:
/// Set the extent state, physical address, user placement hint and
/// reclaim generation in a single call.
void init(extent_state_t _state,
          paddr_t paddr,
          placement_hint_t hint,
          reclaim_gen_t gen) {
  this->state = _state;
  this->set_paddr(paddr);
  this->user_hint = hint;
  this->reclaim_generation = gen;
}
void set_last_modified(seastar::lowres_system_clock::duration d) {
last_modified = seastar::lowres_system_clock::time_point(d);
@ -209,7 +219,9 @@ public:
<< ", length=" << get_length()
<< ", state=" << state
<< ", last_committed_crc=" << last_committed_crc
<< ", refcount=" << use_count();
<< ", refcount=" << use_count()
<< ", user_hint=" << user_hint
// reclaim_gen_t is a uint8_t: streaming it raw would emit a character
// instead of a number, so format it through the printer.
<< ", reclaim_gen=" << reclaim_gen_printer_t{reclaim_generation};
if (state != extent_state_t::INVALID &&
state != extent_state_t::CLEAN_PENDING) {
print_detail(out);
@ -374,8 +386,24 @@ public:
virtual ~CachedExtent();
/// hint for allocators
placement_hint_t hint = placement_hint_t::NUM_HINTS;
/// Returns the placement hint currently recorded on this extent.
placement_hint_t get_user_hint() const {
return user_hint;
}
/// Returns the reclaim generation currently recorded on this extent.
reclaim_gen_t get_reclaim_generation() const {
return reclaim_generation;
}
/// Reset both placement attributes to their "unset" sentinel values.
void invalidate_hints() {
  reclaim_generation = NULL_GENERATION;
  user_hint = placement_hint_t::NUM_HINTS;
}
/// Record `gen` as the reclaim generation of this extent.
/// Note: this also overwrites the user hint with REWRITE.
void set_reclaim_generation(reclaim_gen_t gen) {
  assert(gen < RECLAIM_GENERATIONS);
  reclaim_generation = gen;
  user_hint = placement_hint_t::REWRITE;
}
bool is_inline() const {
return poffset.is_relative();
@ -454,6 +482,11 @@ private:
read_set_item_t<Transaction>::list transactions;
/// placement hint for allocators; NUM_HINTS means "not set".
/// Default-initialized so that extents which never go through init()
/// do not carry an indeterminate value.
placement_hint_t user_hint = placement_hint_t::NUM_HINTS;
/// > 0 and not null means the extent is under reclaiming;
/// NULL_GENERATION means "not set".
reclaim_gen_t reclaim_generation = NULL_GENERATION;
protected:
CachedExtent(CachedExtent &&other) = delete;
CachedExtent(ceph::bufferptr &&ptr) : ptr(std::move(ptr)) {}

View File

@ -10,10 +10,11 @@ SET_SUBSYS(seastore_journal);
namespace crimson::os::seastore {
SegmentedOolWriter::SegmentedOolWriter(
std::string name,
data_category_t category,
reclaim_gen_t gen,
SegmentProvider& sp,
SegmentSeqAllocator &ssa)
: segment_allocator(name, segment_type_t::OOL, sp, ssa),
: segment_allocator(segment_type_t::OOL, category, gen, sp, ssa),
record_submitter(crimson::common::get_conf<uint64_t>(
"seastore_journal_iodepth_limit"),
crimson::common::get_conf<uint64_t>(
@ -55,7 +56,7 @@ SegmentedOolWriter::write_record(
TRACET("{} ool extent written at {} -- {}",
t, segment_allocator.get_name(),
extent_addr, *extent);
extent->hint = placement_hint_t::NUM_HINTS; // invalidate hint
extent->invalidate_hints();
t.mark_delayed_extent_ool(extent, extent_addr);
extent_addr = extent_addr.as_seg_paddr().add_offset(
extent->get_length());

View File

@ -48,7 +48,8 @@ class SegmentProvider;
*/
class SegmentedOolWriter : public ExtentOolWriter {
public:
SegmentedOolWriter(std::string name,
SegmentedOolWriter(data_category_t category,
reclaim_gen_t gen,
SegmentProvider &sp,
SegmentSeqAllocator &ssa);
@ -85,26 +86,29 @@ private:
class ExtentPlacementManager {
public:
// prefer_ool: when true, alloc_new_extent() delays generation-0 METADATA
// extents (assigning them generation 1) instead of placing them
// record-relative.
ExtentPlacementManager() {
ExtentPlacementManager(bool prefer_ool)
: prefer_ool{prefer_ool} {
devices_by_id.resize(DEVICE_ID_GLOBAL_MAX, nullptr);
}
// (Re)create the out-of-line writers: one DATA writer per generation in
// [0, RECLAIM_GENERATIONS), and one METADATA writer per generation in
// [1, RECLAIM_GENERATIONS). METADATA writers are stored at index gen - 1.
void init_ool_writers(SegmentProvider &sp, SegmentSeqAllocator &ssa) {
// Currently only one SegmentProvider is supported, so hardcode the
// writers_by_hint for now.
writer_seed = 0;
// Currently only one SegmentProvider is supported
writer_refs.clear();
writers_by_hint.resize((std::size_t)placement_hint_t::NUM_HINTS, {});
// ool writer is not supported for placement_hint_t::HOT
writer_refs.emplace_back(
std::make_unique<SegmentedOolWriter>("COLD", sp, ssa));
writers_by_hint[(std::size_t)placement_hint_t::COLD
].emplace_back(writer_refs.back().get());
writer_refs.emplace_back(
std::make_unique<SegmentedOolWriter>("REWRITE", sp, ssa));
writers_by_hint[(std::size_t)placement_hint_t::REWRITE
].emplace_back(writer_refs.back().get());
ceph_assert(RECLAIM_GENERATIONS > 0);
data_writers_by_gen.resize(RECLAIM_GENERATIONS, {});
for (reclaim_gen_t gen = 0; gen < RECLAIM_GENERATIONS; ++gen) {
writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
data_category_t::DATA, gen, sp, ssa));
data_writers_by_gen[gen] = writer_refs.back().get();
}
// no generation-0 METADATA writer is created here, hence size - 1
md_writers_by_gen.resize(RECLAIM_GENERATIONS - 1, {});
for (reclaim_gen_t gen = 1; gen < RECLAIM_GENERATIONS; ++gen) {
writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
data_category_t::METADATA, gen, sp, ssa));
md_writers_by_gen[gen - 1] = writer_refs.back().get();
}
}
void add_device(Device* device, bool is_primary) {
@ -132,8 +136,10 @@ public:
open_ertr::future<> open() {
LOG_PREFIX(ExtentPlacementManager::open);
SUBINFO(seastore_journal, "started");
return crimson::do_for_each(writers_by_hint, [](auto& writers) {
return crimson::do_for_each(writers, [](auto& writer) {
return crimson::do_for_each(data_writers_by_gen, [](auto &writer) {
return writer->open();
}).safe_then([this] {
return crimson::do_for_each(md_writers_by_gen, [](auto &writer) {
return writer->open();
});
});
@ -142,14 +148,18 @@ public:
// Result of alloc_new_extent().
struct alloc_result_t {
// address assigned to the new extent (record-relative or delayed-temp)
paddr_t paddr;
// buffer backing the new extent
bufferptr bp;
// generation actually assigned; may differ from the requested one
// (e.g. COLD extents get COLD_GENERATION)
reclaim_gen_t gen;
};
alloc_result_t alloc_new_extent(
Transaction& t,
extent_types_t type,
seastore_off_t length,
placement_hint_t hint
placement_hint_t hint,
reclaim_gen_t gen
) {
assert(hint < placement_hint_t::NUM_HINTS);
assert(gen < RECLAIM_GENERATIONS);
assert(gen == 0 || hint == placement_hint_t::REWRITE);
// XXX: bp might be extended to point to different memory (e.g. PMem)
// according to the allocator.
@ -160,19 +170,35 @@ public:
if (!is_logical_type(type)) {
// TODO: implement out-of-line strategy for physical extent.
return {make_record_relative_paddr(0),
std::move(bp)};
std::move(bp),
0};
}
// FIXME: set delay for COLD extent and improve GC
// NOTE: delay means to delay the decision about whether to write the
// extent as inline or out-of-line extents.
bool delay = (hint > placement_hint_t::COLD);
if (delay) {
if (hint == placement_hint_t::COLD) {
assert(gen == 0);
return {make_delayed_temp_paddr(0),
std::move(bp)};
std::move(bp),
COLD_GENERATION};
}
if (get_extent_category(type) == data_category_t::METADATA &&
gen == 0) {
// gen 0 METADATA writer is the journal writer
if (prefer_ool) {
return {make_delayed_temp_paddr(0),
std::move(bp),
1};
} else {
return {make_record_relative_paddr(0),
std::move(bp),
0};
}
} else {
return {make_record_relative_paddr(0),
std::move(bp)};
assert(get_extent_category(type) == data_category_t::DATA ||
gen > 0);
return {make_delayed_temp_paddr(0),
std::move(bp),
gen};
}
}
@ -193,7 +219,10 @@ public:
[this, &t, &delayed_extents](auto& alloc_map) {
for (auto& extent : delayed_extents) {
// For now, just do ool allocation for any delayed extent
auto writer_ptr = get_writer(extent->hint);
auto writer_ptr = get_writer(
extent->get_user_hint(),
get_extent_category(extent->get_type()),
extent->get_reclaim_generation());
alloc_map[writer_ptr].emplace_back(extent);
}
return trans_intr::do_for_each(alloc_map, [&t](auto& p) {
@ -208,8 +237,10 @@ public:
close_ertr::future<> close() {
LOG_PREFIX(ExtentPlacementManager::close);
SUBINFO(seastore_journal, "started");
return crimson::do_for_each(writers_by_hint, [](auto& writers) {
return crimson::do_for_each(writers, [](auto& writer) {
return crimson::do_for_each(data_writers_by_gen, [](auto &writer) {
return writer->close();
}).safe_then([this] {
return crimson::do_for_each(md_writers_by_gen, [](auto &writer) {
return writer->close();
});
}).safe_then([this] {
@ -230,18 +261,27 @@ public:
}
private:
// Select the out-of-line writer for an extent by its data category and
// reclaim generation. DATA writers are indexed directly by gen; METADATA
// writers are stored at index gen - 1, so gen must be > 0 for METADATA.
// hint is only validated here, it does not affect the selection.
ExtentOolWriter* get_writer(placement_hint_t hint) {
ExtentOolWriter* get_writer(placement_hint_t hint,
data_category_t category,
reclaim_gen_t gen) {
assert(hint < placement_hint_t::NUM_HINTS);
auto hint_index = static_cast<std::size_t>(hint);
assert(hint_index < writers_by_hint.size());
auto& writers = writers_by_hint[hint_index];
assert(writers.size() > 0);
return writers[writer_seed++ % writers.size()];
assert(gen < RECLAIM_GENERATIONS);
if (category == data_category_t::DATA) {
return data_writers_by_gen[gen];
} else {
assert(category == data_category_t::METADATA);
// gen 0 METADATA writer is the journal writer
assert(gen > 0);
return md_writers_by_gen[gen - 1];
}
}
std::size_t writer_seed = 0;
bool prefer_ool;
std::vector<ExtentOolWriterRef> writer_refs;
std::vector<std::vector<ExtentOolWriter*>> writers_by_hint;
std::vector<ExtentOolWriter*> data_writers_by_gen;
// gen 0 METADATA writer is the journal writer
std::vector<ExtentOolWriter*> md_writers_by_gen;
std::vector<Device*> devices_by_id;
Device* primary_device = nullptr;
};

View File

@ -13,13 +13,15 @@ SET_SUBSYS(seastore_journal);
namespace crimson::os::seastore::journal {
SegmentAllocator::SegmentAllocator(
std::string name,
segment_type_t type,
data_category_t category,
reclaim_gen_t gen,
SegmentProvider &sp,
SegmentSeqAllocator &ssa)
: name{name},
print_name{fmt::format("D?_{}", name)},
: print_name{fmt::format("{}_G{}", category, gen)},
type{type},
category{category},
gen{gen},
segment_provider{sp},
sm_group{*sp.get_segment_manager_group()},
segment_seq_allocator(ssa)
@ -40,7 +42,8 @@ SegmentAllocator::do_open()
new_segment_seq,
reinterpret_cast<const unsigned char *>(meta.seastore_id.bytes()),
sizeof(meta.seastore_id.uuid));
auto new_segment_id = segment_provider.allocate_segment(new_segment_seq, type);
auto new_segment_id = segment_provider.allocate_segment(
new_segment_seq, type, category, gen);
ceph_assert(new_segment_id != NULL_SEG_ID);
return sm_group.open(new_segment_id
).handle_error(
@ -66,7 +69,9 @@ SegmentAllocator::do_open()
new_journal_tail,
new_alloc_replay_from,
current_segment_nonce,
type};
type,
category,
gen};
INFO("{} writing header to new segment ... -- {}",
print_name, header);
@ -124,7 +129,8 @@ SegmentAllocator::open()
for (auto& device_id : device_ids) {
oss << "_" << device_id_printer_t{device_id};
}
oss << "_" << name;
oss << "_"
<< fmt::format("{}_G{}", category, gen);
print_name = oss.str();
INFO("{}", print_name);

View File

@ -30,8 +30,9 @@ class SegmentAllocator {
crimson::ct_error::input_output_error>;
public:
SegmentAllocator(std::string name,
segment_type_t type,
SegmentAllocator(segment_type_t type,
data_category_t category,
reclaim_gen_t gen,
SegmentProvider &sp,
SegmentSeqAllocator &ssa);
@ -111,11 +112,12 @@ class SegmentAllocator {
using close_segment_ertr = base_ertr;
close_segment_ertr::future<> close_segment();
const std::string name;
// device id is not available during construction,
// so generate the print_name later.
std::string print_name;
const segment_type_t type; // JOURNAL or OOL
const data_category_t category;
const reclaim_gen_t gen;
SegmentProvider &segment_provider;
SegmentManagerGroup &sm_group;
SegmentRef current_segment;

View File

@ -31,8 +31,9 @@ SegmentedJournal::SegmentedJournal(
: segment_provider(segment_provider),
segment_seq_allocator(
new SegmentSeqAllocator(segment_type_t::JOURNAL)),
journal_segment_allocator("JOURNAL",
segment_type_t::JOURNAL,
journal_segment_allocator(segment_type_t::JOURNAL,
data_category_t::METADATA,
0, // generation
segment_provider,
*segment_seq_allocator),
record_submitter(crimson::common::get_conf<uint64_t>(

View File

@ -173,6 +173,29 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t)
}
}
/// Render a reclaim generation: the null sentinel, an out-of-range value,
/// or a valid generation number.
std::ostream &operator<<(std::ostream &out, reclaim_gen_printer_t gen)
{
  const auto value = gen.gen;
  if (value == NULL_GENERATION) {
    out << "NULL_GEN";
    return out;
  }
  // anything at or above RECLAIM_GENERATIONS (other than null) is invalid
  const char *label =
    (value < RECLAIM_GENERATIONS) ? "GEN(" : "INVALID_GEN(";
  out << label << static_cast<unsigned>(value) << ")";
  return out;
}
/// Render a data category as a short tag ("MD" or "DATA"); any other
/// value is reported as invalid.
std::ostream &operator<<(std::ostream &out, data_category_t c)
{
  if (c == data_category_t::METADATA) {
    return out << "MD";
  }
  if (c == data_category_t::DATA) {
    return out << "DATA";
  }
  return out << "INVALID_CATEGORY!";
}
std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs)
{
bool first = false;
@ -224,6 +247,8 @@ std::ostream &operator<<(std::ostream &out, const segment_header_t &header)
<< ", journal_tail=" << header.journal_tail
<< ", segment_nonce=" << header.segment_nonce
<< ", type=" << header.type
<< ", category=" << header.category
// key was misspelled "generaton"; the value is formatted the same way
// as in segment_info_t's printer
<< ", generation=" << reclaim_gen_printer_t{header.generation}
<< ")";
}

View File

@ -757,9 +757,9 @@ constexpr objaddr_t OBJ_ADDR_MAX = std::numeric_limits<objaddr_t>::max();
constexpr objaddr_t OBJ_ADDR_NULL = OBJ_ADDR_MAX;
enum class placement_hint_t {
HOT = 0, // Most of the metadata
COLD, // Object data
REWRITE, // Cold metadata and data (probably need further splits)
HOT = 0, // The default user hint that expects mutations or retirement
COLD, // Expect no mutations and no retirement in the near future
REWRITE, // Hint for the internal rewrites
NUM_HINTS // Constant for number of hints
};
@ -973,6 +973,37 @@ constexpr bool is_backref_node(extent_types_t type)
std::ostream &operator<<(std::ostream &out, extent_types_t t);
// Reclaim generation of an extent/segment, stored in a single byte.
using reclaim_gen_t = uint8_t;

// Generation assigned to a dirty extent when it gets rewritten.
constexpr reclaim_gen_t DIRTY_GENERATION{1};
// NOTE(review): currently equal to DIRTY_GENERATION; its users are not
// visible from here -- confirm the two are meant to coincide.
constexpr reclaim_gen_t COLD_GENERATION{1};
// Number of generations; a valid generation is in [0, RECLAIM_GENERATIONS).
constexpr reclaim_gen_t RECLAIM_GENERATIONS{3};
// Sentinel meaning "no generation", e.g. for an empty segment.
constexpr reclaim_gen_t NULL_GENERATION{
  std::numeric_limits<reclaim_gen_t>::max()};

// Wrapper that selects the symbolic stream output for a generation instead
// of the default formatting of the underlying uint8_t.
struct reclaim_gen_printer_t {
  reclaim_gen_t gen;
};

std::ostream &operator<<(std::ostream &out, reclaim_gen_printer_t gen);
// Category of the data held by an extent or segment.
enum class data_category_t : uint8_t {
  METADATA = 0,
  DATA = 1,
  // Not a real category: it is the value an empty segment carries and is
  // rejected when a segment is opened or initialised as closed.
  NUM = 2
};

std::ostream &operator<<(std::ostream &out, data_category_t c);
// Maps an extent type to its data category.
// OBJECT_DATA_BLOCK and COLL_BLOCK extents are DATA; every other extent
// type is METADATA.
constexpr data_category_t get_extent_category(extent_types_t type) {
  const bool is_data =
    (type == extent_types_t::OBJECT_DATA_BLOCK) ||
    (type == extent_types_t::COLL_BLOCK);
  return is_data ? data_category_t::DATA : data_category_t::METADATA;
}
enum class record_commit_type_t : uint8_t {
NONE,
MODIFY,
@ -1419,6 +1450,9 @@ struct segment_header_t {
segment_type_t type;
data_category_t category;
reclaim_gen_t generation;
segment_type_t get_type() const {
return type;
}
@ -1431,6 +1465,8 @@ struct segment_header_t {
denc(v.alloc_replay_from, p);
denc(v.segment_nonce, p);
denc(v.type, p);
denc(v.category, p);
denc(v.generation, p);
DENC_FINISH(p);
}
};

View File

@ -28,16 +28,14 @@ TransactionManager::TransactionManager(
CacheRef _cache,
LBAManagerRef _lba_manager,
ExtentPlacementManagerRef &&epm,
BackrefManagerRef&& backref_manager,
tm_make_config_t config)
BackrefManagerRef&& backref_manager)
: async_cleaner(std::move(_async_cleaner)),
cache(std::move(_cache)),
lba_manager(std::move(_lba_manager)),
journal(std::move(_journal)),
epm(std::move(epm)),
backref_manager(std::move(backref_manager)),
sm_group(*async_cleaner->get_segment_manager_group()),
config(config)
sm_group(*async_cleaner->get_segment_manager_group())
{
async_cleaner->set_extent_callback(this);
journal->set_write_pipeline(&write_pipeline);
@ -473,7 +471,8 @@ TransactionManager::rewrite_logical_extent(
t,
lextent->get_type(),
lextent->get_length(),
placement_hint_t::REWRITE)->cast<LogicalCachedExtent>();
lextent->get_user_hint(),
lextent->get_reclaim_generation())->cast<LogicalCachedExtent>();
lextent->get_bptr().copy_out(
0,
lextent->get_length(),
@ -497,7 +496,8 @@ TransactionManager::rewrite_logical_extent(
TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
Transaction &t,
CachedExtentRef extent)
CachedExtentRef extent,
reclaim_gen_t target_generation)
{
LOG_PREFIX(TransactionManager::rewrite_extent);
@ -511,6 +511,13 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
ceph_assert(!extent->is_pending_io());
}
assert(extent->is_valid() && !extent->is_initial_pending());
if (extent->is_dirty()) {
extent->set_reclaim_generation(DIRTY_GENERATION);
} else {
extent->set_reclaim_generation(target_generation);
}
t.get_rewrite_version_stats().increment(extent->get_version());
if (is_backref_node(extent->get_type())) {
@ -640,7 +647,7 @@ TransactionManager::~TransactionManager() {}
TransactionManagerRef make_transaction_manager(tm_make_config_t config)
{
LOG_PREFIX(make_transaction_manager);
auto epm = std::make_unique<ExtentPlacementManager>();
auto epm = std::make_unique<ExtentPlacementManager>(config.epm_prefer_ool);
auto cache = std::make_unique<Cache>(*epm);
auto lba_manager = lba_manager::create_lba_manager(*cache);
auto sms = std::make_unique<SegmentManagerGroup>();
@ -681,8 +688,7 @@ TransactionManagerRef make_transaction_manager(tm_make_config_t config)
std::move(cache),
std::move(lba_manager),
std::move(epm),
std::move(backref_manager),
config);
std::move(backref_manager));
}
}

View File

@ -35,15 +35,16 @@ namespace crimson::os::seastore {
class Journal;
struct tm_make_config_t {
bool is_test = true;
journal_type_t j_type = journal_type_t::SEGMENT_JOURNAL;
placement_hint_t default_placement_hint = placement_hint_t::HOT;
bool is_test;
journal_type_t j_type;
bool epm_prefer_ool;
reclaim_gen_t default_generation;
static tm_make_config_t get_default() {
return tm_make_config_t {
false,
journal_type_t::SEGMENT_JOURNAL,
placement_hint_t::HOT
false
};
}
static tm_make_config_t get_test_segmented_journal() {
@ -52,7 +53,7 @@ struct tm_make_config_t {
return tm_make_config_t {
true,
journal_type_t::SEGMENT_JOURNAL,
placement_hint_t::HOT
false
};
}
static tm_make_config_t get_test_cb_journal() {
@ -61,7 +62,7 @@ struct tm_make_config_t {
return tm_make_config_t {
true,
journal_type_t::CIRCULARBOUNDED_JOURNAL,
placement_hint_t::REWRITE
true
};
}
@ -71,9 +72,9 @@ private:
tm_make_config_t(
bool is_test,
journal_type_t j_type,
placement_hint_t default_placement_hint)
bool epm_prefer_ool)
: is_test(is_test), j_type(j_type),
default_placement_hint(default_placement_hint)
epm_prefer_ool(epm_prefer_ool)
{}
};
@ -114,8 +115,7 @@ public:
CacheRef cache,
LBAManagerRef lba_manager,
ExtentPlacementManagerRef &&epm,
BackrefManagerRef&& backref_manager,
tm_make_config_t config = tm_make_config_t::get_default());
BackrefManagerRef&& backref_manager);
/// Writes initial metadata to disk
using mkfs_ertr = base_ertr;
@ -338,14 +338,8 @@ public:
alloc_extent_ret<T> alloc_extent(
Transaction &t,
laddr_t laddr_hint,
extent_len_t len) {
placement_hint_t placement_hint;
if constexpr (T::TYPE == extent_types_t::OBJECT_DATA_BLOCK ||
T::TYPE == extent_types_t::COLL_BLOCK) {
placement_hint = placement_hint_t::COLD;
} else {
placement_hint = config.default_placement_hint;
}
extent_len_t len,
placement_hint_t placement_hint = placement_hint_t::HOT) {
LOG_PREFIX(TransactionManager::alloc_extent);
SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}",
t, T::TYPE, len, placement_hint, laddr_hint);
@ -353,7 +347,8 @@ public:
auto ext = cache->alloc_new_extent<T>(
t,
len,
placement_hint);
placement_hint,
0);
return lba_manager->alloc_extent(
t,
laddr_hint,
@ -447,7 +442,8 @@ public:
using AsyncCleaner::ExtentCallbackInterface::rewrite_extent_ret;
rewrite_extent_ret rewrite_extent(
Transaction &t,
CachedExtentRef extent) final;
CachedExtentRef extent,
reclaim_gen_t target_generation) final;
using AsyncCleaner::ExtentCallbackInterface::get_extent_if_live_ret;
get_extent_if_live_ret get_extent_if_live(
@ -608,10 +604,10 @@ private:
WritePipeline write_pipeline;
tm_make_config_t config;
rewrite_extent_ret rewrite_logical_extent(
Transaction& t,
LogicalCachedExtentRef extent);
public:
// Testing interfaces
auto get_async_cleaner() {

View File

@ -60,7 +60,9 @@ struct btree_test_base :
segment_id_t allocate_segment(
segment_seq_t seq,
segment_type_t type
segment_type_t type,
data_category_t,
reclaim_gen_t
) final {
auto ret = next;
next = segment_id_t{
@ -111,7 +113,7 @@ struct btree_test_base :
}).safe_then([this] {
sms.reset(new SegmentManagerGroup());
journal = journal::make_segmented(*this);
epm.reset(new ExtentPlacementManager());
epm.reset(new ExtentPlacementManager(false));
cache.reset(new Cache(*epm));
block_size = segment_manager->get_block_size();
@ -368,7 +370,11 @@ struct btree_lba_manager_test : btree_test_base {
test_lba_mappings
};
if (create_fake_extent) {
cache->alloc_new_extent<TestBlockPhysical>(*t.t, TestBlockPhysical::SIZE);
cache->alloc_new_extent<TestBlockPhysical>(
*t.t,
TestBlockPhysical::SIZE,
placement_hint_t::HOT,
0);
};
return t;
}

View File

@ -135,7 +135,7 @@ struct cbjournal_test_t : public seastar_test_suite_t
cbjournal_test_t() :
segment_manager(segment_manager::create_test_ephemeral()),
epm(new ExtentPlacementManager()),
epm(new ExtentPlacementManager(true)),
cache(*epm)
{
device = new nvme_device::TestMemory(CBTEST_DEFAULT_TEST_SIZE);

View File

@ -88,7 +88,7 @@ struct cache_test_t : public seastar_test_suite_t {
return segment_manager->mkfs(
segment_manager::get_ephemeral_device_config(0, 1));
}).safe_then([this] {
epm.reset(new ExtentPlacementManager());
epm.reset(new ExtentPlacementManager(false));
cache.reset(new Cache(*epm));
current = paddr_t::make_seg_paddr(segment_id_t(segment_manager->get_device_id(), 0), 0);
epm->add_device(segment_manager.get(), true);
@ -131,7 +131,9 @@ TEST_F(cache_test_t, test_addr_fixup)
auto t = get_transaction();
auto extent = cache->alloc_new_extent<TestBlockPhysical>(
*t,
TestBlockPhysical::SIZE);
TestBlockPhysical::SIZE,
placement_hint_t::HOT,
0);
extent->set_contents('c');
csum = extent->get_crc32c();
submit_transaction(std::move(t)).get0();
@ -160,7 +162,9 @@ TEST_F(cache_test_t, test_dirty_extent)
auto t = get_transaction();
auto extent = cache->alloc_new_extent<TestBlockPhysical>(
*t,
TestBlockPhysical::SIZE);
TestBlockPhysical::SIZE,
placement_hint_t::HOT,
0);
extent->set_contents('c');
csum = extent->get_crc32c();
auto reladdr = extent->get_paddr();

View File

@ -109,7 +109,9 @@ struct journal_test_t : seastar_test_suite_t, SegmentProvider {
segment_id_t allocate_segment(
segment_seq_t seq,
segment_type_t type
segment_type_t type,
data_category_t,
reclaim_gen_t
) final {
auto ret = next;
next = segment_id_t{