mirror of
https://github.com/ceph/ceph
synced 2025-02-24 19:47:44 +00:00
Merge pull request #54504 from aclamk/wip-aclamk-bs-refactor-write-path
os/bluestore: Recompression, part 2. New write path.
This commit is contained in:
commit
a787a91719
0
qa/objectstore_debug/%
Normal file
0
qa/objectstore_debug/%
Normal file
@ -0,0 +1,5 @@
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
osd:
|
||||
bluestore write v2 random: true
|
@ -0,0 +1,5 @@
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
osd:
|
||||
bluestore write v2: false
|
@ -0,0 +1,5 @@
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
osd:
|
||||
bluestore write v2: true
|
@ -1 +1 @@
|
||||
.qa/objectstore_debug/bluestore-bitmap.yaml
|
||||
.qa/objectstore_debug/bluestore/bluestore-bitmap.yaml
|
@ -1 +1 @@
|
||||
.qa/objectstore_debug/bluestore-comp-lz4.yaml
|
||||
.qa/objectstore_debug/bluestore/bluestore-comp-lz4.yaml
|
@ -1 +1 @@
|
||||
.qa/objectstore_debug/bluestore-comp-snappy.yaml
|
||||
.qa/objectstore_debug/bluestore/bluestore-comp-snappy.yaml
|
@ -1 +1 @@
|
||||
../thrash-erasure-code/objectstore/bluestore-bitmap.yaml
|
||||
../thrash-erasure-code/objectstore/bluestore/bluestore-bitmap.yaml
|
@ -1 +1 @@
|
||||
.qa/objectstore_debug/bluestore-bitmap.yaml
|
||||
.qa/objectstore_debug/bluestore/bluestore-bitmap.yaml
|
@ -5042,6 +5042,29 @@ options:
|
||||
flags:
|
||||
- create
|
||||
with_legacy: false
|
||||
- name: bluestore_write_v2
|
||||
type: bool
|
||||
level: advanced
|
||||
desc: Use faster write path
|
||||
long_desc: Original write path was developed over long time by constantly adding features.
|
||||
The price was layered inefficiencies gained along the way.
|
||||
Rework of write path done from scratch clears it and optimizes for typical cases.
|
||||
Write_v2 is necessary for recompression feature.
|
||||
default: false
|
||||
flags:
|
||||
- startup
|
||||
with_legacy: false
|
||||
- name: bluestore_write_v2_random
|
||||
type: bool
|
||||
level: advanced
|
||||
desc: Random selection of write path mode
|
||||
long_desc: For testing purposes. If true, value of bluestore_write_v2 is randomly selected.
|
||||
default: false
|
||||
see_also:
|
||||
- bluestore_write_v2
|
||||
flags:
|
||||
- startup
|
||||
with_legacy: false
|
||||
- name: bluestore_allocator
|
||||
type: str
|
||||
level: advanced
|
||||
|
@ -63,6 +63,8 @@ set(alien_store_srcs
|
||||
${PROJECT_SOURCE_DIR}/src/os/bluestore/HybridAllocator.cc
|
||||
${PROJECT_SOURCE_DIR}/src/os/bluestore/StupidAllocator.cc
|
||||
${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapAllocator.cc
|
||||
${PROJECT_SOURCE_DIR}/src/os/bluestore/Writer.cc
|
||||
${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore_debug.cc
|
||||
${PROJECT_SOURCE_DIR}/src/os/memstore/MemStore.cc)
|
||||
add_library(crimson-alienstore STATIC
|
||||
${alien_store_srcs})
|
||||
|
@ -90,6 +90,17 @@ constexpr inline T p2nphase(T x, T align) {
|
||||
return -x & (align - 1);
|
||||
}
|
||||
|
||||
/*
 * return the distance from x to the end of the align-sized block
 * that contains it; when x sits exactly on a boundary the whole
 * block (align) is returned
 * eg, p2remain(0x1234, 0x100) == 0xcc
 * eg, p2remain(0x5600, 0x100) == 0x100
 */
template<typename T>
constexpr inline T p2remain(T x, T align) {
  // how far x is into its block
  const T phase = p2phase(x, align);
  return align - phase;
}
|
||||
|
||||
/*
|
||||
* return x rounded up to an align boundary
|
||||
* eg, p2roundup(0x1234, 0x100) == 0x1300 (0x13*align)
|
||||
|
@ -26,6 +26,7 @@ if(WITH_BLUESTORE)
|
||||
bluestore/BtreeAllocator.cc
|
||||
bluestore/Btree2Allocator.cc
|
||||
bluestore/HybridAllocator.cc
|
||||
bluestore/Writer.cc
|
||||
)
|
||||
endif(WITH_BLUESTORE)
|
||||
|
||||
|
@ -54,6 +54,7 @@
|
||||
#include "common/pretty_binary.h"
|
||||
#include "common/WorkQueue.h"
|
||||
#include "kv/KeyValueHistogram.h"
|
||||
#include "Writer.h"
|
||||
|
||||
#if defined(WITH_LTTNG)
|
||||
#define TRACEPOINT_DEFINE
|
||||
@ -4299,6 +4300,42 @@ BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
|
||||
return fp;
|
||||
}
|
||||
|
||||
// Split extent at desired offset.
|
||||
// Returns iterator to the right part.
|
||||
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::split_at(
|
||||
BlueStore::extent_map_t::iterator p, uint32_t offset)
|
||||
{
|
||||
ceph_assert(p != extent_map.end());
|
||||
ceph_assert(p->logical_offset < offset);
|
||||
ceph_assert(offset < p->logical_end());
|
||||
add(offset, p->blob_offset + (offset - p->logical_offset),
|
||||
p->logical_end() - offset, p->blob);
|
||||
p->length = offset - p->logical_offset;
|
||||
++p;
|
||||
return p;
|
||||
}
|
||||
|
||||
// If inside extent split it, and return right part.
|
||||
// If not inside extent return extent on right.
|
||||
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::maybe_split_at(uint32_t offset)
|
||||
{
|
||||
auto p = seek_lextent(offset);
|
||||
if (p != extent_map.end()) {
|
||||
if (p->logical_offset < offset && offset < p->logical_end()) {
|
||||
// need to split
|
||||
add(offset, p->blob_offset + (offset - p->logical_offset),
|
||||
p->logical_end() - offset, p->blob);
|
||||
p->length = offset - p->logical_offset;
|
||||
++p;
|
||||
// check that we moved to proper extent
|
||||
ceph_assert(p->logical_offset == offset);
|
||||
} else {
|
||||
// the extent is either outside offset or exactly at
|
||||
}
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
|
||||
{
|
||||
auto fp = seek_lextent(offset);
|
||||
@ -6219,6 +6256,9 @@ void BlueStore::_init_logger()
|
||||
|
||||
// write op stats
|
||||
//****************************************
|
||||
b.add_time_avg(l_bluestore_write_lat, "write_lat",
|
||||
"write_op average execution time",
|
||||
"aw", PerfCountersBuilder::PRIO_USEFUL);
|
||||
b.add_u64_counter(l_bluestore_write_big, "write_big",
|
||||
"Large aligned writes into fresh blobs");
|
||||
b.add_u64_counter(l_bluestore_write_big_bytes, "write_big_bytes",
|
||||
@ -9109,7 +9149,12 @@ int BlueStore::_mount()
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
use_write_v2 = cct->_conf.get_val<bool>("bluestore_write_v2");
|
||||
if (cct->_conf.get_val<bool>("bluestore_write_v2_random")) {
|
||||
srand(time(NULL));
|
||||
use_write_v2 = rand() % 2;
|
||||
cct->_conf.set_val("bluestore_write_v2", std::to_string(use_write_v2));
|
||||
}
|
||||
_kv_only = false;
|
||||
if (cct->_conf->bluestore_fsck_on_mount) {
|
||||
int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
|
||||
@ -16640,18 +16685,7 @@ int BlueStore::_do_alloc_write(
|
||||
}
|
||||
|
||||
// checksum
|
||||
int64_t csum = csum_type.load();
|
||||
csum = select_option(
|
||||
"csum_type",
|
||||
csum,
|
||||
[&]() {
|
||||
int64_t val;
|
||||
if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
|
||||
return std::optional<int64_t>(val);
|
||||
}
|
||||
return std::optional<int64_t>();
|
||||
}
|
||||
);
|
||||
int64_t csum = wctx->csum_type;
|
||||
|
||||
// compress (as needed) and calc needed space
|
||||
uint64_t need = 0;
|
||||
@ -17055,6 +17089,21 @@ void BlueStore::_choose_write_options(
|
||||
// apply basic csum block size
|
||||
wctx->csum_order = block_size_order;
|
||||
|
||||
// checksum
|
||||
int64_t csum = csum_type.load();
|
||||
csum = select_option(
|
||||
"csum_type",
|
||||
csum,
|
||||
[&]() {
|
||||
int64_t val;
|
||||
if (c->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
|
||||
return std::optional<int64_t>(val);
|
||||
}
|
||||
return std::optional<int64_t>();
|
||||
}
|
||||
);
|
||||
wctx->csum_type = csum;
|
||||
|
||||
// compression parameters
|
||||
unsigned alloc_hints = o->onode.alloc_hint_flags;
|
||||
auto cm = select_option(
|
||||
@ -17291,6 +17340,51 @@ int BlueStore::_do_write(
|
||||
return r;
|
||||
}
|
||||
|
||||
// Write path v2: stores the first `length` bytes of `bl` at `offset` of
// object `o` by means of BlueStore::Writer.
// When the selected write options request compression the call is handed
// over to the original _do_write() and its result is returned.
// Returns 0 otherwise.
int BlueStore::_do_write_v2(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  uint32_t fadvise_flags)
{
  dout(20) << __func__
           << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length
           << " - have 0x" << o->onode.size
           << " (" << std::dec << o->onode.size << ")"
           << " bytes" << std::hex
           << " fadvise_flags 0x" << fadvise_flags
           << " alloc_hint 0x" << o->onode.alloc_hint_flags
           << " expected_object_size " << o->onode.expected_object_size
           << " expected_write_size " << o->onode.expected_write_size
           << std::dec
           << dendl;
  _dump_onode<30>(cct, *o);

  // an empty write changes nothing
  if (length == 0) {
    return 0;
  }

  WriteContext wctx;
  _choose_write_options(c, o, fadvise_flags, &wctx);
  if (wctx.compress) {
    // if we have compression, skip to write_v1
    return _do_write(txc, c, o, offset, length, bl, fadvise_flags);
  }

  // cut off whatever the buffer holds beyond `length`
  // NOTE(review): presumably bl.length() >= length always holds here;
  // the subtraction below would wrap otherwise - confirm against callers
  if (bl.length() != length) {
    bl.splice(length, bl.length() - length);
  }

  auto& emap = o->extent_map;
  emap.fault_range(db, offset, length);
  BlueStore::Writer writer(this, txc, &wctx, o);
  writer.do_write(offset, bl);
  emap.compress_extent_map(offset, length);
  emap.dirty_range(offset, length);
  emap.maybe_reshard(offset, offset + length);
  return 0;
}
|
||||
|
||||
int BlueStore::_write(TransContext *txc,
|
||||
CollectionRef& c,
|
||||
OnodeRef& o,
|
||||
@ -17301,14 +17395,21 @@ int BlueStore::_write(TransContext *txc,
|
||||
dout(15) << __func__ << " " << c->cid << " " << o->oid
|
||||
<< " 0x" << std::hex << offset << "~" << length << std::dec
|
||||
<< dendl;
|
||||
auto start = mono_clock::now();
|
||||
int r = 0;
|
||||
if (offset + length >= OBJECT_MAX_SIZE) {
|
||||
r = -E2BIG;
|
||||
} else {
|
||||
_assign_nid(txc, o);
|
||||
r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
|
||||
if (use_write_v2) {
|
||||
r = _do_write_v2(txc, c, o, offset, length, bl, fadvise_flags);
|
||||
} else {
|
||||
r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
|
||||
}
|
||||
txc->write_onode(o);
|
||||
}
|
||||
auto finish = mono_clock::now();
|
||||
logger->tinc(l_bluestore_write_lat, finish - start);
|
||||
dout(10) << __func__ << " " << c->cid << " " << o->oid
|
||||
<< " 0x" << std::hex << offset << "~" << length << std::dec
|
||||
<< " = " << r << dendl;
|
||||
|
@ -127,6 +127,7 @@ enum {
|
||||
|
||||
// write op stats
|
||||
//****************************************
|
||||
l_bluestore_write_lat,
|
||||
l_bluestore_write_big,
|
||||
l_bluestore_write_big_bytes,
|
||||
l_bluestore_write_big_blobs,
|
||||
@ -725,6 +726,7 @@ public:
|
||||
o.blob_bl = blob_bl;
|
||||
#endif
|
||||
}
|
||||
void add_tail(uint32_t new_blob_size, uint32_t min_release_size);
|
||||
void dup(const Blob& from, bool copy_used_in_blob);
|
||||
void copy_from(CephContext* cct, const Blob& from,
|
||||
uint32_t min_release_size, uint32_t start, uint32_t len);
|
||||
@ -747,6 +749,11 @@ public:
|
||||
/// put logical references, and get back any released extents
|
||||
bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
|
||||
PExtentVector *r);
|
||||
uint32_t put_ref_accumulate(
|
||||
Collection *coll,
|
||||
uint32_t offset,
|
||||
uint32_t length,
|
||||
PExtentVector *released_disk);
|
||||
/// split the blob
|
||||
void split(Collection *coll, uint32_t blob_offset, Blob *o);
|
||||
|
||||
@ -1149,6 +1156,10 @@ public:
|
||||
extent_map_t::iterator seek_lextent(uint64_t offset);
|
||||
extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
|
||||
|
||||
/// split extent
|
||||
extent_map_t::iterator split_at(extent_map_t::iterator p, uint32_t offset);
|
||||
/// if inside extent split it, if not return extent on right
|
||||
extent_map_t::iterator maybe_split_at(uint32_t offset);
|
||||
/// add a new Extent
|
||||
void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
|
||||
extent_map.insert(*new Extent(lo, o, l, b));
|
||||
@ -1827,6 +1838,14 @@ public:
|
||||
values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
|
||||
return *this;
|
||||
}
|
||||
/// Equality over the five STATFS_* slots listed below; evaluation stops at
/// the first slot that differs.
bool operator==(const volatile_statfs& rhs) const {
  auto same = [&](auto slot) {
    return values[slot] == rhs.values[slot];
  };
  return same(STATFS_ALLOCATED)
      && same(STATFS_STORED)
      && same(STATFS_COMPRESSED_ORIGINAL)
      && same(STATFS_COMPRESSED)
      && same(STATFS_COMPRESSED_ALLOCATED);
}
|
||||
bool is_empty() {
|
||||
return values[STATFS_ALLOCATED] == 0 &&
|
||||
values[STATFS_STORED] == 0 &&
|
||||
@ -2335,6 +2354,9 @@ public:
|
||||
bool apply_defer();
|
||||
};
|
||||
|
||||
class Writer;
|
||||
friend class Writer;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// members
|
||||
private:
|
||||
@ -2432,6 +2454,7 @@ private:
|
||||
std::numeric_limits<decltype(min_alloc_size)>::digits,
|
||||
"not enough bits for min_alloc_size");
|
||||
bool elastic_shared_blobs = false; ///< use smart ExtentMap::dup to reduce shared blob count
|
||||
bool use_write_v2 = false; ///< use new write path
|
||||
|
||||
enum {
|
||||
// Please preserve the order since it's DB persistent
|
||||
@ -3523,6 +3546,29 @@ public:
|
||||
CephContext* cct, const std::string &path,
|
||||
const bluestore_bdev_label_t& label, uint64_t disk_position = 0);
|
||||
|
||||
/// Public debug_ wrapper that forwards all arguments to _punch_hole_2(),
/// passing the raw Collection pointer held by `c`.
/// Whatever _punch_hole_2() returns is discarded.
void debug_punch_hole_2(
  CollectionRef& c,
  OnodeRef& o,
  uint32_t offset,
  uint32_t length,
  PExtentVector& released,
  std::vector<BlobRef>& pruned_blobs,
  std::set<SharedBlobRef>& shared_changed,
  volatile_statfs& statfs_delta)
{
  _punch_hole_2(
    c.get(), o,
    offset, length,
    released, pruned_blobs, shared_changed, statfs_delta);
}
|
||||
/// Gives access to the `alloc` pointer member itself (by reference),
/// so the caller can both read and replace it.
Allocator*& debug_get_alloc() { return alloc; }
|
||||
/// Overrides block_size and recomputes the two values derived from it:
/// block_size_order (number of trailing zero bits) and block_mask.
/// NOTE(review): the derived values only make sense for a power-of-two
/// size - presumably callers guarantee that; confirm.
void debug_set_block_size(uint64_t _block_size) {
  block_size = _block_size;
  block_size_order = std::countr_zero(block_size);
  block_mask = ~(block_size - 1);
}
|
||||
/// Overrides prefer_deferred_size with `s`.
void debug_set_prefer_deferred_size(uint64_t s) { prefer_deferred_size = s; }
|
||||
inline void log_latency(const char* name,
|
||||
int idx,
|
||||
const ceph::timespan& lat,
|
||||
@ -3615,12 +3661,13 @@ private:
|
||||
|
||||
// --------------------------------------------------------
|
||||
// write ops
|
||||
|
||||
public:
|
||||
struct WriteContext {
|
||||
bool buffered = false; ///< buffered write
|
||||
bool compress = false; ///< compressed write
|
||||
uint64_t target_blob_size = 0; ///< target (max) blob size
|
||||
uint8_t csum_type = 0; ///< checksum type for new blobs
|
||||
unsigned csum_order = 0; ///< target checksum chunk order
|
||||
uint64_t target_blob_size = 0; ///< target (max) blob size
|
||||
|
||||
old_extent_map_t old_extents; ///< must deref these blobs
|
||||
interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection
|
||||
@ -3669,6 +3716,7 @@ private:
|
||||
buffered = other.buffered;
|
||||
compress = other.compress;
|
||||
target_blob_size = other.target_blob_size;
|
||||
csum_type = other.csum_type;
|
||||
csum_order = other.csum_order;
|
||||
}
|
||||
void write(
|
||||
@ -3698,6 +3746,16 @@ private:
|
||||
uint64_t loffs_end,
|
||||
uint64_t min_alloc_size);
|
||||
};
|
||||
private:
|
||||
BlueStore::extent_map_t::iterator _punch_hole_2(
|
||||
Collection* c,
|
||||
OnodeRef& o,
|
||||
uint32_t offset,
|
||||
uint32_t length,
|
||||
PExtentVector& released,
|
||||
std::vector<BlobRef>& pruned_blobs,
|
||||
std::set<SharedBlobRef>& shared_changed,
|
||||
volatile_statfs& statfs_delta);
|
||||
void _do_write_small(
|
||||
TransContext *txc,
|
||||
CollectionRef &c,
|
||||
@ -3765,6 +3823,13 @@ private:
|
||||
uint64_t length,
|
||||
ceph::buffer::list& bl,
|
||||
WriteContext *wctx);
|
||||
int _do_write_v2(
|
||||
TransContext *txc,
|
||||
CollectionRef &c,
|
||||
OnodeRef& o,
|
||||
uint64_t offset, uint64_t length,
|
||||
ceph::buffer::list& bl,
|
||||
uint32_t fadvise_flags);
|
||||
|
||||
int _touch(TransContext *txc,
|
||||
CollectionRef& c,
|
||||
|
1425
src/os/bluestore/Writer.cc
Normal file
1425
src/os/bluestore/Writer.cc
Normal file
File diff suppressed because it is too large
Load Diff
213
src/os/bluestore/Writer.h
Normal file
213
src/os/bluestore/Writer.h
Normal file
@ -0,0 +1,213 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2023 IBM
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*/
|
||||
|
||||
#ifndef BLUESTORE_WRITER
|
||||
#define BLUESTORE_WRITER
|
||||
|
||||
#include "BlueStore.h"
|
||||
#include "Allocator.h"
|
||||
|
||||
/// Helper that performs a single object write for the write_v2 path.
/// It is constructed with the store, the transaction context, the chosen
/// write options and the target onode, and is driven through do_write().
class BlueStore::Writer {
public:
  using exmp_it = extent_map_t::iterator;
  using P = BlueStore::printer;

  // Data that is to be put to object.
  struct blob_data_t {
    //uint32_t location;        // There is no need for each chunk to have separate location.
    uint32_t real_length;       // Size of object data covered by this chunk. Same as object_data.length().
    uint32_t compressed_length; // Size of compressed representation. 0 or disk_data.length().
    bufferlist disk_data;       // Bitstream to go to disk. It is either the same as object_data,
                                // or contains compressed data. Block aligned.
    bufferlist object_data;     // Object data. Needed to put into caches.
    bool is_compressed() const {return compressed_length != 0;}
  };
  using blob_vec = std::vector<blob_data_t>;

  // Pairs a blob_vec with a base position; consumed by the
  // operator<< declared after this class.
  struct blob_data_printer {
    const blob_vec& blobs;
    uint32_t base_position;
    blob_data_printer(const blob_vec& blobs, uint32_t base_position)
    : blobs(blobs), base_position(base_position) {}
  };

  // Interface of an object that can be plugged in via test_write_divertor.
  struct write_divertor {
    virtual ~write_divertor() = default;
    virtual void write(
      uint64_t disk_offset, const bufferlist& data, bool deferred) = 0;
  };
  // Interface of an object that can be plugged in via test_read_divertor.
  struct read_divertor {
    virtual ~read_divertor() = default;
    virtual bufferlist read(uint32_t object_offset, uint32_t object_length) = 0;
  };

  Writer(BlueStore* bstore, TransContext* txc, WriteContext* wctx, OnodeRef o)
    :bstore(bstore), txc(txc), wctx(wctx), onode(o) {
    pp_mode = debug_level_to_pp_mode(bstore->cct);
  }
public:
  // Write `data` into the object at logical offset `location`.
  void do_write(
    uint32_t location,
    bufferlist& data
  );

  void debug_iterate_buffers(
    std::function<void(uint32_t offset, const bufferlist& data)> data_callback
  );

  write_divertor* test_write_divertor = nullptr;
  read_divertor* test_read_divertor = nullptr;
  std::vector<BlobRef> pruned_blobs;
  volatile_statfs statfs_delta;

private:
  BlueStore* bstore;
  TransContext* txc;
  WriteContext* wctx;
  OnodeRef onode;
  PExtentVector released;   //filled by punch_hole
  PExtentVector allocated;  //filled by alloc()
  bool do_deferred = false;
  // note: disk_allocs.it carries no meaningful value after construction.
  // it must be initialized in do_write
  struct {
    PExtentVector::iterator it; //iterator
    uint32_t pos = 0;           //in-iterator position; zeroed so it is never indeterminate
  } disk_allocs; //disk locations to use when placing data
  uint16_t pp_mode = 0; //pretty print mode
  uint16_t debug_level_to_pp_mode(CephContext* cct);

  inline void _crop_allocs_to_io(
    PExtentVector& disk_extents,
    uint32_t crop_front,
    uint32_t crop_back);

  inline exmp_it _find_mutable_blob_left(
    exmp_it it,
    uint32_t search_begin, // only interested in blobs that are
    uint32_t search_end,   // within range [begin - end)
    uint32_t mapmust_begin,// for 'unused' case: the area
    uint32_t mapmust_end); // [begin - end) must be mapped

  inline exmp_it _find_mutable_blob_right(
    exmp_it it,
    uint32_t search_begin,  // only interested in blobs that are
    uint32_t search_end,    // within range [begin - end)
    uint32_t mapmust_begin, // for 'unused' case: the area
    uint32_t mapmust_end);  // [begin - end) must be mapped

  inline void _schedule_io_masked(
    uint64_t disk_offset,
    bufferlist data,
    bluestore_blob_t::unused_t mask,
    uint32_t chunk_size);

  inline void _schedule_io(
    const PExtentVector& disk_extents,
    bufferlist data);

  //Take `length` space from `this.disk_allocs` and put it to `dst`.
  void _get_disk_space(
    uint32_t length,
    PExtentVector& dst);

  inline bufferlist _read_self(
    uint32_t offset,
    uint32_t length);

  inline void _maybe_expand_blob(
    Blob* blob,
    uint32_t new_blob_size);

  inline void _blob_put_data(
    Blob* blob,
    uint32_t in_blob_offset,
    bufferlist disk_data);

  void _split_data(
    uint32_t location,
    bufferlist& data,
    blob_vec& bd);

  void _align_to_disk_block(
    uint32_t& location,
    uint32_t& ref_end,
    blob_vec& blobs
  );

  inline void _blob_put_data_subau(
    Blob* blob,
    uint32_t in_blob_offset,
    bufferlist disk_data);

  inline void _blob_put_data_allocate(
    Blob* blob,
    uint32_t in_blob_offset,
    bufferlist disk_data);

  inline void _blob_put_data_subau_allocate(
    Blob* blob,
    uint32_t in_blob_offset,
    bufferlist disk_data);

  BlobRef _blob_create_with_data(
    uint32_t in_blob_offset,
    bufferlist& disk_data);

  BlobRef _blob_create_full(
    bufferlist& disk_data);

  void _try_reuse_allocated_l(
    exmp_it after_punch_it,   // hint, we could have found it ourselves
    uint32_t& logical_offset, // will fix value if something consumed
    uint32_t ref_end_offset,  // useful when data is padded
    blob_data_t& bd);         // modified when consumed

  void _try_reuse_allocated_r(
    exmp_it after_punch_it,   // hint, we could have found it ourselves
    uint32_t& end_offset,     // will fix value if something consumed
    uint32_t ref_end_offset,  // useful when data is padded
    blob_data_t& bd);         // modified when consumed

  void _try_put_data_on_allocated(
    uint32_t& logical_offset,
    uint32_t& end_offset,
    uint32_t& ref_end_offset,
    blob_vec& bd,
    exmp_it after_punch_it);

  void _do_put_new_blobs(
    uint32_t logical_offset,
    uint32_t ref_end_offset,
    blob_vec::iterator& bd_it,
    blob_vec::iterator bd_end);

  void _do_put_blobs(
    uint32_t logical_offset,
    uint32_t data_end_offset,
    uint32_t ref_end_offset,
    blob_vec& bd,
    exmp_it after_punch_it);

  std::pair<bool, uint32_t> _write_expand_l(
    uint32_t logical_offset);

  std::pair<bool, uint32_t> _write_expand_r(
    uint32_t end_offset);

  void _collect_released_allocated();

  void _defer_or_allocate(uint32_t need_size);
};
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const BlueStore::Writer::blob_data_printer& printer);
|
||||
|
||||
#endif // BLUESTORE_WRITER
|
@ -534,6 +534,62 @@ bool bluestore_blob_use_tracker_t::put(
|
||||
return empty;
|
||||
}
|
||||
|
||||
|
||||
std::pair<uint32_t, uint32_t> bluestore_blob_use_tracker_t::put_simple(
|
||||
uint32_t offset, uint32_t length)
|
||||
{
|
||||
if (num_au == 0) {
|
||||
// single tracker for entire blob
|
||||
ceph_assert(total_bytes >= length);
|
||||
total_bytes -= length;
|
||||
if (total_bytes == 0) {
|
||||
return std::make_pair(0, au_size);
|
||||
} else {
|
||||
return std::make_pair(0, 0);
|
||||
}
|
||||
} else {
|
||||
uint32_t clear_start = 0;
|
||||
uint32_t clear_end = 0;
|
||||
uint32_t pos = offset / au_size;
|
||||
uint32_t remain = p2remain(offset, au_size);
|
||||
if (length <= remain) {
|
||||
// all in same block
|
||||
ceph_assert(length <= bytes_per_au[pos]);
|
||||
bytes_per_au[pos] -= length;
|
||||
if (bytes_per_au[pos] == 0) {
|
||||
clear_start = pos * au_size;
|
||||
clear_end = clear_start + au_size;
|
||||
}
|
||||
} else {
|
||||
// length > remain
|
||||
ceph_assert(remain <= bytes_per_au[pos]);
|
||||
bytes_per_au[pos] -= remain;
|
||||
if (bytes_per_au[pos] == 0) {
|
||||
clear_start = pos * au_size;
|
||||
} else {
|
||||
clear_start = (pos + 1) * au_size;
|
||||
}
|
||||
++pos;
|
||||
length -= remain;
|
||||
while (length >= au_size) {
|
||||
ceph_assert(au_size == bytes_per_au[pos]);
|
||||
bytes_per_au[pos] = 0;
|
||||
++pos;
|
||||
length -= au_size;
|
||||
}
|
||||
if (length > 0) {
|
||||
ceph_assert(length <= bytes_per_au[pos]);
|
||||
bytes_per_au[pos] -= length;
|
||||
if (bytes_per_au[pos] == 0) {
|
||||
++pos;
|
||||
}
|
||||
}
|
||||
clear_end = pos * au_size;
|
||||
}
|
||||
return std::make_pair(clear_start, clear_end - clear_start);
|
||||
}
|
||||
}
|
||||
|
||||
bool bluestore_blob_use_tracker_t::can_split() const
|
||||
{
|
||||
return num_au > 0;
|
||||
@ -1080,6 +1136,148 @@ bool bluestore_blob_t::release_extents(bool all,
|
||||
return false;
|
||||
}
|
||||
|
||||
// Erases allocations from blob's extents and
|
||||
// appends them to released_disk extents.
|
||||
// For non-shared blobs it directly represents AUs to release.
|
||||
// For shared blobs AUs need to be processed by SharesBlob's bluestore_extent_ref_map_t.
|
||||
// (SharedBlob->persistent->ref_map)
|
||||
// returns
|
||||
// disk space size to release
|
||||
uint32_t bluestore_blob_t::release_extents(
|
||||
uint32_t offset,
|
||||
uint32_t length,
|
||||
PExtentVector* released_disk)
|
||||
{
|
||||
uint32_t released_length = 0;
|
||||
constexpr auto EMPTY = bluestore_pextent_t::INVALID_OFFSET;
|
||||
if (offset == 0 && length == get_logical_length()) {
|
||||
released_length = get_ondisk_length();
|
||||
released_disk->insert(released_disk->end(), extents.begin(), extents.end());
|
||||
extents.resize(1);
|
||||
extents[0].offset = EMPTY;
|
||||
extents[0].length = released_length;
|
||||
return released_length;
|
||||
}
|
||||
bluestore_pextent_t* begin = &*extents.begin();
|
||||
bluestore_pextent_t* p = &*extents.begin();
|
||||
bluestore_pextent_t* end = &*extents.end(); //beware - it is fixed in place
|
||||
|
||||
bluestore_pextent_t* empty = nullptr;
|
||||
//skip offset
|
||||
while (p->length <= offset) {
|
||||
offset -= p->length;
|
||||
empty = p->is_valid() ? nullptr : p;
|
||||
++p;
|
||||
ceph_assert(p != end); // we assume that length > 0
|
||||
}
|
||||
bluestore_pextent_t hold[2]; // by default initialized to zeros
|
||||
uint32_t hold_size = 0;
|
||||
uint32_t rem = length;
|
||||
bluestore_pextent_t* anchor = p;
|
||||
// copy_to_release
|
||||
if (/*offset >= 0 &&*/ offset + length < p->length) {
|
||||
//special case when in same extent
|
||||
uint64_t p_offset = p->offset;
|
||||
uint32_t p_length = p->length;
|
||||
auto anchor_it = extents.begin() + (anchor - begin);
|
||||
if (offset > 0) {
|
||||
//anchor_it->offset = p_offset; //it is already there
|
||||
anchor_it->length = offset;
|
||||
++anchor_it;
|
||||
released_disk->emplace_back(p->offset + offset, length);
|
||||
released_length += length;
|
||||
anchor_it = extents.insert(anchor_it, 2, bluestore_pextent_t(EMPTY, length));
|
||||
++anchor_it;
|
||||
anchor_it->offset = p_offset + offset + length;
|
||||
anchor_it->length = p_length - offset - length;
|
||||
} else {
|
||||
released_disk->emplace_back(p->offset, length);
|
||||
released_length += length;
|
||||
if (empty) {
|
||||
empty->length += length;
|
||||
} else {
|
||||
anchor_it = extents.insert(anchor_it, 1, bluestore_pextent_t(EMPTY, length));
|
||||
++anchor_it;
|
||||
}
|
||||
anchor_it->offset = p_offset + length;
|
||||
anchor_it->length = p_length - length;
|
||||
}
|
||||
} else {
|
||||
// p->length > offset
|
||||
// offset + length >= p->length
|
||||
if (offset > 0) {
|
||||
//activate hold, put pextent that we need; put new empty
|
||||
ceph_assert(p->is_valid());
|
||||
hold[0].offset = p->offset;
|
||||
hold[0].length = offset;
|
||||
hold[1].offset = EMPTY;
|
||||
hold[1].length = 0;
|
||||
empty = &hold[1];
|
||||
hold_size = 2;
|
||||
} else {
|
||||
// offset == 0
|
||||
if (empty == nullptr) {
|
||||
//we need empty, activate hold
|
||||
hold[0].offset = EMPTY;
|
||||
hold[0].length = 0;
|
||||
empty = &hold[0];
|
||||
hold_size = 1;
|
||||
}
|
||||
}
|
||||
// starts copying remainder
|
||||
if (p->length - offset) {
|
||||
released_disk->emplace_back(p->offset + offset, p->length - offset);
|
||||
released_length += p->length - offset;
|
||||
empty->length += p->length - offset;
|
||||
rem -= (p->length - offset);
|
||||
}
|
||||
++p;
|
||||
while (rem > 0 && p->length <= rem) {
|
||||
ceph_assert(p->is_valid());
|
||||
released_disk->emplace_back(p->offset, p->length);
|
||||
released_length += p->length;
|
||||
empty->length += p->length;
|
||||
rem -= p->length;
|
||||
++p;
|
||||
}
|
||||
if (rem > 0) {
|
||||
ceph_assert(p->is_valid());
|
||||
// this we release
|
||||
released_disk->emplace_back(p->offset, rem);
|
||||
released_length += rem;
|
||||
empty->length += rem;
|
||||
// this much remains
|
||||
p->offset = p->offset + rem;
|
||||
p->length = p->length - rem;
|
||||
//no ++p here; we need this modified p remain part of PExtentVector
|
||||
} else {
|
||||
//amazing, clean cut
|
||||
//if the extent here is empty, we try to meld it
|
||||
if (p != end && !p->is_valid()) {
|
||||
empty->length += p->length;
|
||||
++p;
|
||||
}
|
||||
}
|
||||
// we erase <anchor, p)
|
||||
// and insert hold in this place
|
||||
int32_t insert_element_cnt = hold_size - (p - anchor);
|
||||
auto anchor_it = extents.begin() + (anchor - begin);
|
||||
if (insert_element_cnt > 0) {
|
||||
anchor_it = extents.insert(anchor_it, insert_element_cnt, bluestore_pextent_t(0, 0));
|
||||
}
|
||||
if (insert_element_cnt < 0) {
|
||||
anchor_it = extents.erase(anchor_it, anchor_it + (-insert_element_cnt));
|
||||
}
|
||||
for (uint32_t i = 0; i < hold_size; i++) {
|
||||
anchor_it->offset = hold[i].offset;
|
||||
anchor_it->length = hold[i].length;
|
||||
++anchor_it;
|
||||
}
|
||||
}
|
||||
return released_length;
|
||||
}
|
||||
|
||||
|
||||
void bluestore_blob_t::split(uint32_t blob_offset, bluestore_blob_t& rb)
|
||||
{
|
||||
size_t left = blob_offset;
|
||||
|
@ -16,6 +16,7 @@
|
||||
#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
|
||||
|
||||
#include <bit>
|
||||
#include <limits>
|
||||
#include <ostream>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
@ -388,6 +389,10 @@ struct bluestore_blob_use_tracker_t {
|
||||
uint32_t full_length,
|
||||
uint32_t _au_size);
|
||||
|
||||
inline void init_and_ref(
|
||||
uint32_t full_length,
|
||||
uint32_t tracked_chunk);
|
||||
|
||||
void get(
|
||||
uint32_t offset,
|
||||
uint32_t len);
|
||||
@ -403,6 +408,15 @@ struct bluestore_blob_use_tracker_t {
|
||||
uint32_t len,
|
||||
PExtentVector *release);
|
||||
|
||||
/// Puts back references in region [offset~length].
|
||||
/// It is a different, simpler version of put(),
|
||||
/// as it does not allow for overprovisioning.
|
||||
/// Releasing off=0x500 len=0x2000 from {0x1000,0x1004,0x1000} will fail,
|
||||
/// while put() handles that case properly.
|
||||
std::pair<uint32_t, uint32_t> put_simple(
|
||||
uint32_t offset,
|
||||
uint32_t length);
|
||||
|
||||
bool can_split() const;
|
||||
bool can_split_at(uint32_t blob_offset) const;
|
||||
void split(
|
||||
@ -612,7 +626,9 @@ public:
|
||||
bool is_shared() const {
|
||||
return has_flag(FLAG_SHARED);
|
||||
}
|
||||
|
||||
bool has_disk() const {
|
||||
return extents.size() > 1 || extents.begin()->is_valid();
|
||||
}
|
||||
/// return chunk (i.e. min readable block) size for the blob
|
||||
uint64_t get_chunk_size(uint64_t dev_block_size) const {
|
||||
return has_csum() ?
|
||||
@ -730,7 +746,14 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// todo implement me!
|
||||
unused_t get_unused_mask(uint32_t offset, uint32_t length, uint32_t chunk_size) {
|
||||
if (has_unused()) {
|
||||
return 0;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
// map_f_invoke templates intended to mask parameters which are not expected
|
||||
// by the provided callback
|
||||
template<class F, typename std::enable_if<std::is_invocable_r_v<
|
||||
@ -959,7 +982,24 @@ public:
|
||||
|
||||
void split(uint32_t blob_offset, bluestore_blob_t& rb);
|
||||
void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs);
|
||||
void allocated_full(uint32_t length, PExtentVector&& allocs);
|
||||
void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only
|
||||
static constexpr uint64_t NO_ALLOCATION = std::numeric_limits<uint64_t>::max();
|
||||
uint64_t get_allocation_at(uint32_t in_blob_offset) {
|
||||
uint32_t loc = in_blob_offset;
|
||||
for (auto e : extents) {
|
||||
if (loc < e.length) {
|
||||
//ceph_assert(e.is_valid());
|
||||
if (e.is_valid()) {
|
||||
return e.offset + loc;
|
||||
} else {
|
||||
return NO_ALLOCATION;
|
||||
}
|
||||
}
|
||||
loc -= e.length;
|
||||
}
|
||||
ceph_assert(false);
|
||||
};
|
||||
|
||||
/// updates blob's pextents container and return unused pextents eligible
|
||||
/// for release.
|
||||
@ -971,6 +1011,18 @@ public:
|
||||
bool all,
|
||||
const PExtentVector& logical,
|
||||
PExtentVector* r);
|
||||
|
||||
/// Remove blob's pextents.
|
||||
/// [offset~length] - range to remove, in local blob space
|
||||
/// released_disk - a vector of disk allocation units that are no longer in use;
|
||||
/// appends to it
|
||||
/// returns:
|
||||
/// size of released disk
|
||||
uint32_t release_extents(
|
||||
uint32_t offset,
|
||||
uint32_t length,
|
||||
PExtentVector* released_disk
|
||||
);
|
||||
};
|
||||
WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user