Merge pull request #54504 from aclamk/wip-aclamk-bs-refactor-write-path

os/bluestore: Recompression, part 2. New write path.
Adam Kupczyk 2024-08-13 15:15:50 +02:00 committed by GitHub
commit a787a91719
28 changed files with 3283 additions and 33 deletions

qa/objectstore_debug/% (new file, empty)


@ -0,0 +1,5 @@
overrides:
ceph:
conf:
osd:
bluestore write v2 random: true


@ -0,0 +1,5 @@
overrides:
ceph:
conf:
osd:
bluestore write v2: false


@ -0,0 +1,5 @@
overrides:
ceph:
conf:
osd:
bluestore write v2: true


@ -1 +1 @@
-.qa/objectstore_debug/bluestore-bitmap.yaml
+.qa/objectstore_debug/bluestore/bluestore-bitmap.yaml


@ -1 +1 @@
-.qa/objectstore_debug/bluestore-comp-lz4.yaml
+.qa/objectstore_debug/bluestore/bluestore-comp-lz4.yaml


@ -1 +1 @@
-.qa/objectstore_debug/bluestore-comp-snappy.yaml
+.qa/objectstore_debug/bluestore/bluestore-comp-snappy.yaml


@ -1 +1 @@
-../thrash-erasure-code/objectstore/bluestore-bitmap.yaml
+../thrash-erasure-code/objectstore/bluestore/bluestore-bitmap.yaml


@ -1 +1 @@
-.qa/objectstore_debug/bluestore-bitmap.yaml
+.qa/objectstore_debug/bluestore/bluestore-bitmap.yaml


@ -5042,6 +5042,29 @@ options:
flags:
- create
with_legacy: false
- name: bluestore_write_v2
type: bool
level: advanced
desc: Use faster write path
long_desc: The original write path was developed over a long time by constantly adding features;
the price was layered inefficiencies accumulated along the way.
Reworking the write path from scratch clears them and optimizes for the typical cases.
Write_v2 is necessary for the recompression feature.
default: false
flags:
- startup
with_legacy: false
- name: bluestore_write_v2_random
type: bool
level: advanced
desc: Randomly select the write path mode
long_desc: For testing purposes. If true, the value of bluestore_write_v2 is selected at random.
default: false
see_also:
- bluestore_write_v2
flags:
- startup
with_legacy: false
- name: bluestore_allocator
type: str
level: advanced

View File

@ -63,6 +63,8 @@ set(alien_store_srcs
${PROJECT_SOURCE_DIR}/src/os/bluestore/HybridAllocator.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/StupidAllocator.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapAllocator.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/Writer.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore_debug.cc
${PROJECT_SOURCE_DIR}/src/os/memstore/MemStore.cc)
add_library(crimson-alienstore STATIC
${alien_store_srcs})

View File

@ -90,6 +90,17 @@ constexpr inline T p2nphase(T x, T align) {
return -x & (align - 1);
}
/*
* return how much space is left in this block;
* when perfectly aligned, return the whole block
* eg, p2remain(0x1234, 0x100) == 0xcc
* eg, p2remain(0x5600, 0x100) == 0x100
*/
template<typename T>
constexpr inline T p2remain(T x, T align) {
return align - p2phase(x, align);
}
/*
* return x rounded up to an align boundary
* eg, p2roundup(0x1234, 0x100) == 0x1300 (0x13*align)

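A minimal, self-contained sketch that pins down the p2remain semantics above at compile time (p2phase is re-declared locally for illustration; in-tree code would just include this header):

#include <cstdint>

template<typename T>
constexpr T p2phase(T x, T align) { return x & (align - 1); }

template<typename T>
constexpr T p2remain(T x, T align) { return align - p2phase(x, align); }

static_assert(p2remain(0x1234u, 0x100u) == 0xccu);   // space left up to the next boundary
static_assert(p2remain(0x5600u, 0x100u) == 0x100u);  // already aligned: a whole block remains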

@ -26,6 +26,7 @@ if(WITH_BLUESTORE)
bluestore/BtreeAllocator.cc
bluestore/Btree2Allocator.cc
bluestore/HybridAllocator.cc
bluestore/Writer.cc
)
endif(WITH_BLUESTORE)


@ -54,6 +54,7 @@
#include "common/pretty_binary.h"
#include "common/WorkQueue.h"
#include "kv/KeyValueHistogram.h"
#include "Writer.h"
#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
@ -4299,6 +4300,42 @@ BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
return fp;
}
// Split extent at desired offset.
// Returns iterator to the right part.
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::split_at(
BlueStore::extent_map_t::iterator p, uint32_t offset)
{
ceph_assert(p != extent_map.end());
ceph_assert(p->logical_offset < offset);
ceph_assert(offset < p->logical_end());
add(offset, p->blob_offset + (offset - p->logical_offset),
p->logical_end() - offset, p->blob);
p->length = offset - p->logical_offset;
++p;
return p;
}
// If offset falls inside an extent, split it and return the right part.
// Otherwise return the first extent at or after offset.
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::maybe_split_at(uint32_t offset)
{
auto p = seek_lextent(offset);
if (p != extent_map.end()) {
if (p->logical_offset < offset && offset < p->logical_end()) {
// need to split
add(offset, p->blob_offset + (offset - p->logical_offset),
p->logical_end() - offset, p->blob);
p->length = offset - p->logical_offset;
++p;
// check that we moved to proper extent
ceph_assert(p->logical_offset == offset);
} else {
// the extent either starts exactly at offset or lies entirely to the right; no split needed
}
}
return p;
}
bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
auto fp = seek_lextent(offset);
@ -6219,6 +6256,9 @@ void BlueStore::_init_logger()
// write op stats
//****************************************
b.add_time_avg(l_bluestore_write_lat, "write_lat",
"write_op average execution time",
"aw", PerfCountersBuilder::PRIO_USEFUL);
b.add_u64_counter(l_bluestore_write_big, "write_big",
"Large aligned writes into fresh blobs");
b.add_u64_counter(l_bluestore_write_big_bytes, "write_big_bytes",
@ -9109,7 +9149,12 @@ int BlueStore::_mount()
return r;
}
}
use_write_v2 = cct->_conf.get_val<bool>("bluestore_write_v2");
if (cct->_conf.get_val<bool>("bluestore_write_v2_random")) {
srand(time(NULL));
use_write_v2 = rand() % 2;
cct->_conf.set_val("bluestore_write_v2", std::to_string(use_write_v2));
}
_kv_only = false;
if (cct->_conf->bluestore_fsck_on_mount) {
int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
@ -16640,18 +16685,7 @@ int BlueStore::_do_alloc_write(
}
// checksum
- int64_t csum = csum_type.load();
- csum = select_option(
- "csum_type",
- csum,
- [&]() {
- int64_t val;
- if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
- return std::optional<int64_t>(val);
- }
- return std::optional<int64_t>();
- }
- );
+ int64_t csum = wctx->csum_type;
// compress (as needed) and calc needed space
uint64_t need = 0;
@ -17055,6 +17089,21 @@ void BlueStore::_choose_write_options(
// apply basic csum block size
wctx->csum_order = block_size_order;
// checksum
int64_t csum = csum_type.load();
csum = select_option(
"csum_type",
csum,
[&]() {
int64_t val;
if (c->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
return std::optional<int64_t>(val);
}
return std::optional<int64_t>();
}
);
wctx->csum_type = csum;
// compression parameters
unsigned alloc_hints = o->onode.alloc_hint_flags;
auto cm = select_option(
@ -17291,6 +17340,51 @@ int BlueStore::_do_write(
return r;
}
int BlueStore::_do_write_v2(
TransContext *txc,
CollectionRef& c,
OnodeRef& o,
uint64_t offset,
uint64_t length,
bufferlist& bl,
uint32_t fadvise_flags)
{
int r = 0;
dout(20) << __func__
<< " " << o->oid
<< " 0x" << std::hex << offset << "~" << length
<< " - have 0x" << o->onode.size
<< " (" << std::dec << o->onode.size << ")"
<< " bytes" << std::hex
<< " fadvise_flags 0x" << fadvise_flags
<< " alloc_hint 0x" << o->onode.alloc_hint_flags
<< " expected_object_size " << o->onode.expected_object_size
<< " expected_write_size " << o->onode.expected_write_size
<< std::dec
<< dendl;
_dump_onode<30>(cct, *o);
if (length == 0) {
return 0;
}
WriteContext wctx;
_choose_write_options(c, o, fadvise_flags, &wctx);
if (wctx.compress) {
// write_v2 does not handle compression yet; fall back to write_v1
return _do_write(txc, c, o, offset, length, bl, fadvise_flags);
}
if (bl.length() != length) {
bl.splice(length, bl.length() - length);
}
o->extent_map.fault_range(db, offset, length);
BlueStore::Writer wr(this, txc, &wctx, o);
wr.do_write(offset, bl);
o->extent_map.compress_extent_map(offset, length);
o->extent_map.dirty_range(offset, length);
o->extent_map.maybe_reshard(offset, offset + length);
return r;
}
int BlueStore::_write(TransContext *txc,
CollectionRef& c,
OnodeRef& o,
@ -17301,14 +17395,21 @@ int BlueStore::_write(TransContext *txc,
dout(15) << __func__ << " " << c->cid << " " << o->oid
<< " 0x" << std::hex << offset << "~" << length << std::dec
<< dendl;
+ auto start = mono_clock::now();
int r = 0;
if (offset + length >= OBJECT_MAX_SIZE) {
r = -E2BIG;
} else {
_assign_nid(txc, o);
- r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
+ if (use_write_v2) {
+ r = _do_write_v2(txc, c, o, offset, length, bl, fadvise_flags);
+ } else {
+ r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
+ }
txc->write_onode(o);
}
+ auto finish = mono_clock::now();
+ logger->tinc(l_bluestore_write_lat, finish - start);
dout(10) << __func__ << " " << c->cid << " " << o->oid
<< " 0x" << std::hex << offset << "~" << length << std::dec
<< " = " << r << dendl;

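To illustrate the split semantics used by the new path, here is a simplified, self-contained model of maybe_split_at over a std::map keyed by logical offset (illustrative only; the real ExtentMap is an intrusive set of Extent records carrying BlobRefs):

#include <cassert>
#include <cstdint>
#include <iterator>
#include <map>

struct ext { uint32_t blob_off; uint32_t len; };
using extent_map = std::map<uint32_t, ext>;  // logical_offset -> extent

extent_map::iterator maybe_split_at(extent_map& m, uint32_t offset) {
  auto p = m.lower_bound(offset);
  if (p != m.begin()) {
    auto prev = std::prev(p);
    if (offset < prev->first + prev->second.len) {  // offset falls inside prev
      uint32_t cut = offset - prev->first;
      // right part keeps the same blob, with the in-blob offset shifted
      m[offset] = ext{prev->second.blob_off + cut, prev->second.len - cut};
      prev->second.len = cut;                       // left part is truncated
      return m.find(offset);
    }
  }
  return p;  // offset sits on an extent boundary or in a hole
}

int main() {
  extent_map m = {{0x0000, ext{0, 0x3000}}};
  auto it = maybe_split_at(m, 0x1000);              // splits [0,0x3000) at 0x1000
  assert(it->first == 0x1000 && it->second.len == 0x2000);
  assert(m[0x0000].len == 0x1000);
}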

@ -127,6 +127,7 @@ enum {
// write op stats
//****************************************
l_bluestore_write_lat,
l_bluestore_write_big,
l_bluestore_write_big_bytes,
l_bluestore_write_big_blobs,
@ -725,6 +726,7 @@ public:
o.blob_bl = blob_bl;
#endif
}
void add_tail(uint32_t new_blob_size, uint32_t min_release_size);
void dup(const Blob& from, bool copy_used_in_blob);
void copy_from(CephContext* cct, const Blob& from,
uint32_t min_release_size, uint32_t start, uint32_t len);
@ -747,6 +749,11 @@ public:
/// put logical references, and get back any released extents
bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
PExtentVector *r);
uint32_t put_ref_accumulate(
Collection *coll,
uint32_t offset,
uint32_t length,
PExtentVector *released_disk);
/// split the blob
void split(Collection *coll, uint32_t blob_offset, Blob *o);
@ -1149,6 +1156,10 @@ public:
extent_map_t::iterator seek_lextent(uint64_t offset);
extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
/// split extent
extent_map_t::iterator split_at(extent_map_t::iterator p, uint32_t offset);
/// if inside extent split it, if not return extent on right
extent_map_t::iterator maybe_split_at(uint32_t offset);
/// add a new Extent
void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
extent_map.insert(*new Extent(lo, o, l, b));
@ -1827,6 +1838,14 @@ public:
values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
return *this;
}
bool operator==(const volatile_statfs& rhs) const {
return
values[STATFS_ALLOCATED] == rhs.values[STATFS_ALLOCATED] &&
values[STATFS_STORED] == rhs.values[STATFS_STORED] &&
values[STATFS_COMPRESSED_ORIGINAL] == rhs.values[STATFS_COMPRESSED_ORIGINAL] &&
values[STATFS_COMPRESSED] == rhs.values[STATFS_COMPRESSED] &&
values[STATFS_COMPRESSED_ALLOCATED] == rhs.values[STATFS_COMPRESSED_ALLOCATED];
}
bool is_empty() {
return values[STATFS_ALLOCATED] == 0 &&
values[STATFS_STORED] == 0 &&
@ -2335,6 +2354,9 @@ public:
bool apply_defer();
};
class Writer;
friend class Writer;
// --------------------------------------------------------
// members
private:
@ -2432,6 +2454,7 @@ private:
std::numeric_limits<decltype(min_alloc_size)>::digits,
"not enough bits for min_alloc_size");
bool elastic_shared_blobs = false; ///< use smart ExtentMap::dup to reduce shared blob count
bool use_write_v2 = false; ///< use new write path
enum {
// Please preserve the order since it's DB persistent
@ -3523,6 +3546,29 @@ public:
CephContext* cct, const std::string &path,
const bluestore_bdev_label_t& label, uint64_t disk_position = 0);
void debug_punch_hole_2(
CollectionRef& c,
OnodeRef& o,
uint32_t offset,
uint32_t length,
PExtentVector& released,
std::vector<BlobRef>& pruned_blobs,
std::set<SharedBlobRef>& shared_changed,
volatile_statfs& statfs_delta) {
_punch_hole_2(c.get(), o, offset, length, released,
pruned_blobs, shared_changed, statfs_delta);
}
Allocator*& debug_get_alloc() {
return alloc;
}
void debug_set_block_size(uint64_t _block_size) {
block_size = _block_size;
block_mask = ~(block_size - 1);
block_size_order = std::countr_zero(block_size);
}
void debug_set_prefer_deferred_size(uint64_t s) {
prefer_deferred_size = s;
}
inline void log_latency(const char* name,
int idx,
const ceph::timespan& lat,
@ -3615,12 +3661,13 @@ private:
// --------------------------------------------------------
// write ops
public:
struct WriteContext {
bool buffered = false; ///< buffered write
bool compress = false; ///< compressed write
+ uint64_t target_blob_size = 0; ///< target (max) blob size
+ uint8_t csum_type = 0; ///< checksum type for new blobs
unsigned csum_order = 0; ///< target checksum chunk order
- uint64_t target_blob_size = 0; ///< target (max) blob size
old_extent_map_t old_extents; ///< must deref these blobs
interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection
@ -3669,6 +3716,7 @@ private:
buffered = other.buffered;
compress = other.compress;
target_blob_size = other.target_blob_size;
csum_type = other.csum_type;
csum_order = other.csum_order;
}
void write(
@ -3698,6 +3746,16 @@ private:
uint64_t loffs_end,
uint64_t min_alloc_size);
};
private:
BlueStore::extent_map_t::iterator _punch_hole_2(
Collection* c,
OnodeRef& o,
uint32_t offset,
uint32_t length,
PExtentVector& released,
std::vector<BlobRef>& pruned_blobs,
std::set<SharedBlobRef>& shared_changed,
volatile_statfs& statfs_delta);
void _do_write_small(
TransContext *txc,
CollectionRef &c,
@ -3765,6 +3823,13 @@ private:
uint64_t length,
ceph::buffer::list& bl,
WriteContext *wctx);
int _do_write_v2(
TransContext *txc,
CollectionRef &c,
OnodeRef& o,
uint64_t offset, uint64_t length,
ceph::buffer::list& bl,
uint32_t fadvise_flags);
int _touch(TransContext *txc,
CollectionRef& c,

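The debug_* hooks above give unit tests direct control over internals that are normally derived from the device; a hypothetical usage sketch (assumes a constructed BlueStore instance named store):

store.debug_set_block_size(4096);         // pretend the device has 4 KiB blocks
store.debug_set_prefer_deferred_size(0);  // nothing is small enough to defer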
src/os/bluestore/Writer.cc (new file, 1425 lines)

File diff suppressed because it is too large.

src/os/bluestore/Writer.h (new file, 213 lines)

@ -0,0 +1,213 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2023 IBM
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*/
#ifndef BLUESTORE_WRITER
#define BLUESTORE_WRITER
#include "BlueStore.h"
#include "Allocator.h"
class BlueStore::Writer {
public:
using exmp_it = extent_map_t::iterator;
using P = BlueStore::printer;
// Data that is to be written to the object.
struct blob_data_t {
//uint32_t location; // There is no need for each chunk to have a separate location.
uint32_t real_length; // Size of object data covered by this chunk. Same as object_data.length().
uint32_t compressed_length; // Size of compressed representation. 0 or disk_data.length().
bufferlist disk_data; // Bitstream to go to disk. It is either the same as object_data,
// or contains compressed data. Block aligned.
bufferlist object_data; // Object data. Needed to put into caches.
bool is_compressed() const {return compressed_length != 0;}
};
using blob_vec = std::vector<blob_data_t>;
struct blob_data_printer {
const blob_vec& blobs;
uint32_t base_position;
blob_data_printer(const blob_vec& blobs, uint32_t base_position)
: blobs(blobs), base_position(base_position) {}
};
struct write_divertor {
virtual ~write_divertor() = default;
virtual void write(
uint64_t disk_offset, const bufferlist& data, bool deferred) = 0;
};
struct read_divertor {
virtual ~read_divertor() = default;
virtual bufferlist read(uint32_t object_offset, uint32_t object_length) = 0;
};
Writer(BlueStore* bstore, TransContext* txc, WriteContext* wctx, OnodeRef o)
:bstore(bstore), txc(txc), wctx(wctx), onode(o) {
pp_mode = debug_level_to_pp_mode(bstore->cct);
}
public:
void do_write(
uint32_t location,
bufferlist& data
);
void debug_iterate_buffers(
std::function<void(uint32_t offset, const bufferlist& data)> data_callback
);
write_divertor* test_write_divertor = nullptr;
read_divertor* test_read_divertor = nullptr;
std::vector<BlobRef> pruned_blobs;
volatile_statfs statfs_delta;
private:
BlueStore* bstore;
TransContext* txc;
WriteContext* wctx;
OnodeRef onode;
PExtentVector released; //filled by punch_hole
PExtentVector allocated; //filled by alloc()
bool do_deferred = false;
// note: disk_allocs.it is uninitialized.
// it must be initialized in do_write
struct {
PExtentVector::iterator it; //iterator
uint32_t pos; //in-iterator position
} disk_allocs; //disk locations to use when placing data
uint16_t pp_mode = 0; //pretty print mode
uint16_t debug_level_to_pp_mode(CephContext* cct);
inline void _crop_allocs_to_io(
PExtentVector& disk_extents,
uint32_t crop_front,
uint32_t crop_back);
inline exmp_it _find_mutable_blob_left(
exmp_it it,
uint32_t search_begin, // only interested in blobs that are
uint32_t search_end, // within range [begin - end)
uint32_t mapmust_begin,// for 'unused' case: the area
uint32_t mapmust_end); // [begin - end) must be mapped
inline exmp_it _find_mutable_blob_right(
exmp_it it,
uint32_t search_begin, // only interested in blobs that are
uint32_t search_end, // within range [begin - end)
uint32_t mapmust_begin, // for 'unused' case: the area
uint32_t mapmust_end); // [begin - end) must be mapped
inline void _schedule_io_masked(
uint64_t disk_offset,
bufferlist data,
bluestore_blob_t::unused_t mask,
uint32_t chunk_size);
inline void _schedule_io(
const PExtentVector& disk_extents,
bufferlist data);
//Take `length` bytes of space from `this->disk_allocs` and put them into `dst`.
void _get_disk_space(
uint32_t length,
PExtentVector& dst);
inline bufferlist _read_self(
uint32_t offset,
uint32_t length);
inline void _maybe_expand_blob(
Blob* blob,
uint32_t new_blob_size);
inline void _blob_put_data(
Blob* blob,
uint32_t in_blob_offset,
bufferlist disk_data);
void _split_data(
uint32_t location,
bufferlist& data,
blob_vec& bd);
void _align_to_disk_block(
uint32_t& location,
uint32_t& ref_end,
blob_vec& blobs
);
inline void _blob_put_data_subau(
Blob* blob,
uint32_t in_blob_offset,
bufferlist disk_data);
inline void _blob_put_data_allocate(
Blob* blob,
uint32_t in_blob_offset,
bufferlist disk_data);
inline void _blob_put_data_subau_allocate(
Blob* blob,
uint32_t in_blob_offset,
bufferlist disk_data);
BlobRef _blob_create_with_data(
uint32_t in_blob_offset,
bufferlist& disk_data);
BlobRef _blob_create_full(
bufferlist& disk_data);
void _try_reuse_allocated_l(
exmp_it after_punch_it, // hint, we could have found it ourselves
uint32_t& logical_offset, // will fix value if something consumed
uint32_t ref_end_offset, // useful when data is padded
blob_data_t& bd); // modified when consumed
void _try_reuse_allocated_r(
exmp_it after_punch_it, // hint, we could have found it ourselves
uint32_t& end_offset, // will fix value if something consumed
uint32_t ref_end_offset, // useful when data is padded
blob_data_t& bd); // modified when consumed
void _try_put_data_on_allocated(
uint32_t& logical_offset,
uint32_t& end_offset,
uint32_t& ref_end_offset,
blob_vec& bd,
exmp_it after_punch_it);
void _do_put_new_blobs(
uint32_t logical_offset,
uint32_t ref_end_offset,
blob_vec::iterator& bd_it,
blob_vec::iterator bd_end);
void _do_put_blobs(
uint32_t logical_offset,
uint32_t data_end_offset,
uint32_t ref_end_offset,
blob_vec& bd,
exmp_it after_punch_it);
std::pair<bool, uint32_t> _write_expand_l(
uint32_t logical_offset);
std::pair<bool, uint32_t> _write_expand_r(
uint32_t end_offset);
void _collect_released_allocated();
void _defer_or_allocate(uint32_t need_size);
};
std::ostream& operator<<(std::ostream& out, const BlueStore::Writer::blob_data_printer& printer);
#endif // BLUESTORE_WRITER

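The write_divertor/read_divertor members above are test seams: a test can point test_write_divertor at its own implementation and observe exactly what I/O the Writer schedules, without a device. A self-contained sketch of the same pattern (plain std::string stands in for bufferlist; all names here are illustrative, not the Ceph types):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct write_divertor {
  virtual ~write_divertor() = default;
  virtual void write(uint64_t disk_offset, const std::string& data, bool deferred) = 0;
};

struct capturing_divertor : write_divertor {
  struct io { uint64_t off; std::string data; bool deferred; };
  std::vector<io> ios;
  void write(uint64_t off, const std::string& data, bool deferred) override {
    ios.push_back({off, data, deferred});  // record instead of writing
  }
};

struct writer {
  write_divertor* test_write_divertor = nullptr;  // if set, divert all I/O
  void schedule_io(uint64_t off, const std::string& data, bool deferred) {
    if (test_write_divertor) {
      test_write_divertor->write(off, data, deferred);
      return;
    }
    // ... a real implementation would submit to the block device here ...
  }
};

int main() {
  writer w;
  capturing_divertor cap;
  w.test_write_divertor = &cap;
  w.schedule_io(0x2000, "abc", false);
  std::cout << cap.ios.size() << " diverted io(s)\n";  // prints: 1 diverted io(s)
}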

@ -534,6 +534,62 @@ bool bluestore_blob_use_tracker_t::put(
return empty;
}
std::pair<uint32_t, uint32_t> bluestore_blob_use_tracker_t::put_simple(
uint32_t offset, uint32_t length)
{
if (num_au == 0) {
// single tracker for entire blob
ceph_assert(total_bytes >= length);
total_bytes -= length;
if (total_bytes == 0) {
return std::make_pair(0, au_size);
} else {
return std::make_pair(0, 0);
}
} else {
uint32_t clear_start = 0;
uint32_t clear_end = 0;
uint32_t pos = offset / au_size;
uint32_t remain = p2remain(offset, au_size);
if (length <= remain) {
// all in same block
ceph_assert(length <= bytes_per_au[pos]);
bytes_per_au[pos] -= length;
if (bytes_per_au[pos] == 0) {
clear_start = pos * au_size;
clear_end = clear_start + au_size;
}
} else {
// length > remain
ceph_assert(remain <= bytes_per_au[pos]);
bytes_per_au[pos] -= remain;
if (bytes_per_au[pos] == 0) {
clear_start = pos * au_size;
} else {
clear_start = (pos + 1) * au_size;
}
++pos;
length -= remain;
while (length >= au_size) {
ceph_assert(au_size == bytes_per_au[pos]);
bytes_per_au[pos] = 0;
++pos;
length -= au_size;
}
if (length > 0) {
ceph_assert(length <= bytes_per_au[pos]);
bytes_per_au[pos] -= length;
if (bytes_per_au[pos] == 0) {
++pos;
}
}
clear_end = pos * au_size;
}
return std::make_pair(clear_start, clear_end - clear_start);
}
}
bool bluestore_blob_use_tracker_t::can_split() const
{
return num_au > 0;
@ -1080,6 +1136,148 @@ bool bluestore_blob_t::release_extents(bool all,
return false;
}
// Erases allocations from blob's extents and
// appends them to released_disk extents.
// For non-shared blobs the result directly represents AUs to release.
// For shared blobs the AUs still need to be processed by SharedBlob's bluestore_extent_ref_map_t
// (SharedBlob->persistent->ref_map).
// Returns the size of disk space to release.
uint32_t bluestore_blob_t::release_extents(
uint32_t offset,
uint32_t length,
PExtentVector* released_disk)
{
uint32_t released_length = 0;
constexpr auto EMPTY = bluestore_pextent_t::INVALID_OFFSET;
if (offset == 0 && length == get_logical_length()) {
released_length = get_ondisk_length();
released_disk->insert(released_disk->end(), extents.begin(), extents.end());
extents.resize(1);
extents[0].offset = EMPTY;
extents[0].length = released_length;
return released_length;
}
bluestore_pextent_t* begin = &*extents.begin();
bluestore_pextent_t* p = &*extents.begin();
bluestore_pextent_t* end = &*extents.end(); //beware - it is fixed in place
bluestore_pextent_t* empty = nullptr;
//skip offset
while (p->length <= offset) {
offset -= p->length;
empty = p->is_valid() ? nullptr : p;
++p;
ceph_assert(p != end); // we assume that length > 0
}
bluestore_pextent_t hold[2]; // by default initialized to zeros
uint32_t hold_size = 0;
uint32_t rem = length;
bluestore_pextent_t* anchor = p;
// copy_to_release
if (/*offset >= 0 &&*/ offset + length < p->length) {
//special case when in same extent
uint64_t p_offset = p->offset;
uint32_t p_length = p->length;
auto anchor_it = extents.begin() + (anchor - begin);
if (offset > 0) {
//anchor_it->offset = p_offset; //it is already there
anchor_it->length = offset;
++anchor_it;
released_disk->emplace_back(p->offset + offset, length);
released_length += length;
anchor_it = extents.insert(anchor_it, 2, bluestore_pextent_t(EMPTY, length));
++anchor_it;
anchor_it->offset = p_offset + offset + length;
anchor_it->length = p_length - offset - length;
} else {
released_disk->emplace_back(p->offset, length);
released_length += length;
if (empty) {
empty->length += length;
} else {
anchor_it = extents.insert(anchor_it, 1, bluestore_pextent_t(EMPTY, length));
++anchor_it;
}
anchor_it->offset = p_offset + length;
anchor_it->length = p_length - length;
}
} else {
// p->length > offset
// offset + length >= p->length
if (offset > 0) {
//activate hold, put pextent that we need; put new empty
ceph_assert(p->is_valid());
hold[0].offset = p->offset;
hold[0].length = offset;
hold[1].offset = EMPTY;
hold[1].length = 0;
empty = &hold[1];
hold_size = 2;
} else {
// offset == 0
if (empty == nullptr) {
//we need empty, activate hold
hold[0].offset = EMPTY;
hold[0].length = 0;
empty = &hold[0];
hold_size = 1;
}
}
// starts copying remainder
if (p->length - offset) {
released_disk->emplace_back(p->offset + offset, p->length - offset);
released_length += p->length - offset;
empty->length += p->length - offset;
rem -= (p->length - offset);
}
++p;
while (rem > 0 && p->length <= rem) {
ceph_assert(p->is_valid());
released_disk->emplace_back(p->offset, p->length);
released_length += p->length;
empty->length += p->length;
rem -= p->length;
++p;
}
if (rem > 0) {
ceph_assert(p->is_valid());
// this we release
released_disk->emplace_back(p->offset, rem);
released_length += rem;
empty->length += rem;
// this much remains
p->offset = p->offset + rem;
p->length = p->length - rem;
//no ++p here; we need this modified p to remain part of the PExtentVector
} else {
//amazing, clean cut
//if the extent here is empty, we try to meld it
if (p != end && !p->is_valid()) {
empty->length += p->length;
++p;
}
}
// we erase [anchor, p)
// and insert hold in its place
int32_t insert_element_cnt = hold_size - (p - anchor);
auto anchor_it = extents.begin() + (anchor - begin);
if (insert_element_cnt > 0) {
anchor_it = extents.insert(anchor_it, insert_element_cnt, bluestore_pextent_t(0, 0));
}
if (insert_element_cnt < 0) {
anchor_it = extents.erase(anchor_it, anchor_it + (-insert_element_cnt));
}
for (uint32_t i = 0; i < hold_size; i++) {
anchor_it->offset = hold[i].offset;
anchor_it->length = hold[i].length;
++anchor_it;
}
}
return released_length;
}
void bluestore_blob_t::split(uint32_t blob_offset, bluestore_blob_t& rb)
{
size_t left = blob_offset;

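A compact, self-contained model of put_simple's per-AU accounting (it uses a plain std::vector for the counters; the real tracker also has the num_au == 0 single-counter mode handled at the top of the function above):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Returns {clear_start, clear_length}: the blob range whose AUs became empty.
std::pair<uint32_t, uint32_t> put_simple_model(
    std::vector<uint32_t>& bytes_per_au, uint32_t au_size,
    uint32_t offset, uint32_t length)
{
  uint32_t pos = offset / au_size;
  uint32_t remain = au_size - (offset % au_size);  // == p2remain(offset, au_size)
  uint32_t clear_start = 0, clear_end = 0;
  if (length <= remain) {                          // release fits inside one AU
    bytes_per_au[pos] -= length;
    if (bytes_per_au[pos] == 0) {
      clear_start = pos * au_size;
      clear_end = clear_start + au_size;
    }
  } else {
    bytes_per_au[pos] -= remain;                   // head fragment
    clear_start = (bytes_per_au[pos] == 0 ? pos : pos + 1) * au_size;
    ++pos;
    length -= remain;
    while (length >= au_size) {                    // fully covered AUs
      bytes_per_au[pos++] = 0;
      length -= au_size;
    }
    if (length > 0) {                              // tail fragment
      bytes_per_au[pos] -= length;
      if (bytes_per_au[pos] == 0) ++pos;
    }
    clear_end = pos * au_size;
  }
  return {clear_start, clear_end - clear_start};
}

int main() {
  // Fully referenced blob: three AUs of 0x1000 bytes each.
  std::vector<uint32_t> au = {0x1000, 0x1000, 0x1000};
  // Releasing 0x2000 bytes at offset 0x500 trims AU0's tail and AU2's head
  // and fully releases AU1, so exactly one AU of disk space can be freed.
  auto [start, len] = put_simple_model(au, 0x1000, 0x500, 0x2000);
  assert(start == 0x1000 && len == 0x1000);
}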

@ -16,6 +16,7 @@
#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
#include <bit>
#include <limits>
#include <ostream>
#include <type_traits>
#include <vector>
@ -388,6 +389,10 @@ struct bluestore_blob_use_tracker_t {
uint32_t full_length,
uint32_t _au_size);
inline void init_and_ref(
uint32_t full_length,
uint32_t tracked_chunk);
void get(
uint32_t offset,
uint32_t len);
@ -403,6 +408,15 @@ struct bluestore_blob_use_tracker_t {
uint32_t len,
PExtentVector *release);
/// Puts back references in region [offset~length].
/// It is a different, simpler version of put(),
/// as it does not allow for overprovisioning.
/// Releasing off=0x500 len=0x2000 from {0x1000,0x1004,0x1000} will fail here,
/// while put() handles it properly.
std::pair<uint32_t, uint32_t> put_simple(
uint32_t offset,
uint32_t length);
bool can_split() const;
bool can_split_at(uint32_t blob_offset) const;
void split(
@ -612,7 +626,9 @@ public:
bool is_shared() const {
return has_flag(FLAG_SHARED);
}
bool has_disk() const {
return extents.size() > 1 || extents.begin()->is_valid();
}
/// return chunk (i.e. min readable block) size for the blob
uint64_t get_chunk_size(uint64_t dev_block_size) const {
return has_csum() ?
@ -730,7 +746,14 @@ public:
}
}
}
/// TODO: implement me! For now this always returns 0 (reports no unused chunks).
unused_t get_unused_mask(uint32_t offset, uint32_t length, uint32_t chunk_size) {
if (has_unused()) {
return 0;
} else {
return 0;
}
}
// map_f_invoke templates intended to mask parameters which are not expected
// by the provided callback
template<class F, typename std::enable_if<std::is_invocable_r_v<
@ -959,7 +982,24 @@ public:
void split(uint32_t blob_offset, bluestore_blob_t& rb);
void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs);
void allocated_full(uint32_t length, PExtentVector&& allocs);
void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only
static constexpr uint64_t NO_ALLOCATION = std::numeric_limits<uint64_t>::max();
uint64_t get_allocation_at(uint32_t in_blob_offset) {
uint32_t loc = in_blob_offset;
for (auto e : extents) {
if (loc < e.length) {
//ceph_assert(e.is_valid());
if (e.is_valid()) {
return e.offset + loc;
} else {
return NO_ALLOCATION;
}
}
loc -= e.length;
}
ceph_assert(false);
};
/// updates blob's pextents container and return unused pextents eligible
/// for release.
@ -971,6 +1011,18 @@ public:
bool all,
const PExtentVector& logical,
PExtentVector* r);
/// Remove blob's pextents.
/// [offset~length] - range to remove, in local blob space
/// released_disk - a vector of disk allocation units that are no longer in use;
/// the function appends to it
/// returns:
/// size of released disk space
uint32_t release_extents(
uint32_t offset,
uint32_t length,
PExtentVector* released_disk
);
};
WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)

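A worked example of the release_extents semantics declared above (values are hypothetical): take a blob backed by a single 0x4000-byte pextent at disk offset 0x10000 and release the middle range [0x1000~0x2000]:

before: extents = { {0x10000, 0x4000} }
release_extents(0x1000, 0x2000, &released):
  released gains {0x11000, 0x2000}            // disk space the caller can free
  extents become { {0x10000, 0x1000},         // head, still in use
                   {INVALID_OFFSET, 0x2000},  // hole left inside the blob
                   {0x13000, 0x1000} }        // tail, still in use
  returns 0x2000                              // size of released disk space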
File diff suppressed because it is too large.