mirror of
https://github.com/ceph/ceph
synced 2025-01-03 01:22:53 +00:00
os/bluestore: try to split blobs instead of spanning them
If a blob crosses a shard boundary, try to split it into pieces instead of making it spanning. This is only possible under certain conditions (uncompressed, no in-flight writes, no unused space, shard boundary aligned with chunk boundary). Generally speaking this will reduce the number of blobs that end up spanning and reduce the onode key size. Signed-off-by: Sage Weil <sage@redhat.com>
This commit is contained in:
parent
8d37c68b56
commit
1a39872b9e
@ -1071,6 +1071,50 @@ void BlueStore::BufferSpace::finish_write(uint64_t seq)
|
||||
cache->_audit("finish_write end");
|
||||
}
|
||||
|
||||
void BlueStore::BufferSpace::split(size_t pos, BlueStore::BufferSpace &r)
{
  // Move every cached buffer at or after 'pos' into r, cutting any buffer
  // that straddles the boundary.  Buffer offsets handed to r are rebased
  // so that 'pos' becomes offset 0 in r's coordinate space.
  std::lock_guard<std::recursive_mutex> lk(cache->lock);
  assert(r.cache == cache);
  auto i = buffer_map.begin();
  // Buffers that end at or before the split point stay where they are.
  while (i != buffer_map.end() &&
	 i->second->end() <= pos) {
    dout(30) << __func__ << " skip " << *i->second << dendl;
    ++i;
  }
  if (i != buffer_map.end()) {
    if (i->second->offset < pos) {
      // This buffer straddles the boundary: keep the left piece here and
      // hand the right piece to r at offset 0.
      dout(30) << __func__ << " cut " << *i->second << dendl;
      size_t left = pos - i->second->offset;
      size_t right = i->second->length - left;
      if (i->second->data.length()) {
	bufferlist bl;
	bl.substr_of(i->second->data, left, right);
	r._add_buffer(new Buffer(&r, i->second->state, i->second->seq, 0, bl),
		      0, i->second.get());
      } else {
	// No data payload cached; carry only the length.
	r._add_buffer(new Buffer(&r, i->second->state, i->second->seq, 0, right),
		      0, i->second.get());
      }
      i->second->truncate(left);
      ++i;
    }
    // Everything remaining lies entirely to the right of pos: rebase its
    // offset and transfer it to r, removing it from this space.
    while (i != buffer_map.end()) {
      dout(30) << __func__ << " move " << *i->second << dendl;
      if (i->second->data.length()) {
	r._add_buffer(new Buffer(&r, i->second->state, i->second->seq,
				 i->second->offset - pos, i->second->data),
		      0, i->second.get());
      } else {
	r._add_buffer(new Buffer(&r, i->second->state, i->second->seq,
				 i->second->offset - pos, i->second->length),
		      0, i->second.get());
      }
      _rm_buffer(i++);
    }
  }
  // Callers gate on Blob::can_split(), which requires no in-flight writes.
  assert(writing_map.empty());
}
|
||||
|
||||
// OnodeSpace
|
||||
|
||||
#undef dout_prefix
|
||||
@ -1382,6 +1426,62 @@ bool BlueStore::Blob::put_ref(
|
||||
return false;
|
||||
}
|
||||
|
||||
void BlueStore::Blob::split(size_t blob_offset, Blob *r)
{
  // Carve this blob in two at blob_offset: *this keeps [0, blob_offset),
  // *r receives the remainder.  Physical extents, flags, checksum data and
  // cached buffers are divided accordingly.
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
	   << " start " << *this << dendl;
  assert(blob.can_split());
  bluestore_blob_t &lblob = dirty_blob();
  bluestore_blob_t &rblob = r->dirty_blob();

  unsigned keep = 0;         // how many extents remain on the left side
  size_t skip = blob_offset; // left-side bytes not yet walked past
  for (auto e = lblob.extents.begin(); e != lblob.extents.end(); ++e, ++keep) {
    if (e->length <= skip) {
      skip -= e->length;
      continue;
    }
    if (skip) {
      // The boundary falls inside this extent: truncate it and give the
      // tail to the right-hand blob (invalid extents stay invalid).
      if (e->is_valid()) {
	rblob.extents.emplace_back(bluestore_pextent_t(e->offset + skip,
						       e->length - skip));
      } else {
	rblob.extents.emplace_back(bluestore_pextent_t(
				     bluestore_pextent_t::INVALID_OFFSET,
				     e->length - skip));
      }
      e->length = skip;
      ++keep;
      ++e;
    }
    // All remaining extents belong wholly to the right-hand blob.
    while (e != lblob.extents.end()) {
      rblob.extents.push_back(*e++);
    }
    lblob.extents.resize(keep);
    break;
  }
  rblob.flags = lblob.flags;

  if (lblob.has_csum()) {
    // Partition the checksum values at the (chunk-aligned) cut point.
    rblob.csum_type = lblob.csum_type;
    rblob.csum_chunk_order = lblob.csum_chunk_order;
    size_t csum_order = lblob.get_csum_chunk_size();
    assert(blob_offset % csum_order == 0);
    size_t pos = (blob_offset / csum_order) * lblob.get_csum_value_size();
    // deep copy csum data
    bufferptr old;
    old.swap(lblob.csum_data);
    rblob.csum_data = bufferptr(old.c_str() + pos, old.length() - pos);
    lblob.csum_data = bufferptr(old.c_str(), pos);
  }

  // Split the cached buffer space along the same boundary.
  shared_blob->bc.split(blob_offset, r->shared_blob->bc);

  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
	   << " finish " << *this << dendl;
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
	   << " and " << *r << dendl;
}
|
||||
|
||||
// Extent
|
||||
|
||||
@ -1583,10 +1683,43 @@ void BlueStore::ExtentMap::reshard(Onode *o)
|
||||
}
|
||||
if (ep->blob->id < 0 &&
|
||||
ep->blob_escapes_range(shard_start, shard_end - shard_start)) {
|
||||
ep->blob->id = bid++;
|
||||
spanning_blob_map.insert(*ep->blob);
|
||||
ep->blob->get();
|
||||
dout(20) << __func__ << " adding spanning " << *ep << dendl;
|
||||
// We have two options: (1) split the blob into pieces at the
|
||||
// shard boundaries (and adjust extents accordingly), or (2)
|
||||
// mark it spanning. We prefer to cut the blob if we can. Note that
|
||||
// we may have to split it multiple times--potentially at every
|
||||
// shard boundary.
|
||||
bool must_span = false;
|
||||
BlobRef b = ep->blob;
|
||||
if (b->can_split()) {
|
||||
uint32_t bstart = ep->logical_offset - ep->blob_offset;
|
||||
uint32_t bend = bstart + b->get_blob().get_logical_length();
|
||||
for (const auto& sh : shards) {
|
||||
if (bstart < sh.offset && bend > sh.offset) {
|
||||
uint32_t blob_offset = sh.offset - bstart;
|
||||
if (b->get_blob().can_split_at(blob_offset)) {
|
||||
dout(20) << __func__ << " splitting blob, bstart 0x"
|
||||
<< std::hex << bstart
|
||||
<< " blob_offset 0x" << blob_offset
|
||||
<< std::dec << " " << *b << dendl;
|
||||
b = split_blob(b, blob_offset, sh.offset);
|
||||
// switch b to the new right-hand side, in case it
|
||||
// *also* has to get split.
|
||||
bstart += blob_offset;
|
||||
} else {
|
||||
must_span = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
must_span = true;
|
||||
}
|
||||
if (must_span) {
|
||||
b->id = bid++;
|
||||
spanning_blob_map.insert(*b);
|
||||
b->get();
|
||||
dout(20) << __func__ << " adding spanning " << *b << dendl;
|
||||
}
|
||||
}
|
||||
++ep;
|
||||
}
|
||||
@ -2026,6 +2159,48 @@ BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
|
||||
return le;
|
||||
}
|
||||
|
||||
BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  // Split blob lb at blob_offset (which maps to logical offset pos) and
  // repoint every affected lextent at or after pos to the new right-hand
  // blob, which is returned.
  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
	   << " blob_offset 0x" << blob_offset << std::dec
	   << " " << *lb << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(blob_offset, rb.get());

  for (auto e = seek_lextent(pos);
       e != extent_map.end() && e->logical_offset < end_pos;
       ++e) {
    if (e->blob != lb) {
      // Unrelated blob interleaved in this logical range; leave it alone.
      continue;
    }
    vector<bluestore_pextent_t> released;
    if (e->logical_offset < pos) {
      // Lextent straddles the cut: shorten it and create a new lextent on
      // the right-hand blob for the tail.
      size_t left = pos - e->logical_offset;
      Extent *ne = new Extent(pos, 0, e->length - left, e->blob_depth, rb);
      extent_map.insert(*ne);
      lb->ref_map.put(e->blob_offset + left, e->length - left, &released);
      e->length = left;
      rb->ref_map.get(ne->blob_offset, ne->length);
      dout(30) << __func__ << " split " << *e << dendl;
      dout(30) << __func__ << " to " << *ne << dendl;
    } else {
      // Lextent lies wholly right of the cut: move its reference to the
      // new blob and rebase its blob offset.
      assert(e->blob_offset >= blob_offset);
      lb->ref_map.put(e->blob_offset, e->length, &released);
      e->blob = rb;
      e->blob_offset -= blob_offset;
      rb->ref_map.get(e->blob_offset, e->length);
      dout(30) << __func__ << " adjusted " << *e << dendl;
    }
  }
  return rb;
}
|
||||
|
||||
|
||||
// Onode
|
||||
|
||||
@ -6029,9 +6204,8 @@ void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
|
||||
o->extent_map.reshard(o.get());
|
||||
reshard = o->extent_map.update(o.get(), t, true);
|
||||
if (reshard) {
|
||||
derr << __func__ << " warning: still wants reshard, check options?"
|
||||
<< dendl;
|
||||
assert(0 == "reshard problem");
|
||||
dout(20) << __func__ << " warning: still wants reshard, check options?"
|
||||
<< dendl;
|
||||
}
|
||||
logger->inc(l_bluestore_onode_reshard);
|
||||
}
|
||||
|
@ -287,6 +287,8 @@ public:
|
||||
discard(offset, (uint64_t)-1 - offset);
|
||||
}
|
||||
|
||||
void split(size_t pos, BufferSpace &r);
|
||||
|
||||
void dump(Formatter *f) const {
|
||||
std::lock_guard<std::recursive_mutex> l(cache->lock);
|
||||
f->open_array_section("buffers");
|
||||
@ -434,6 +436,11 @@ public:
|
||||
return id >= 0;
|
||||
}
|
||||
|
||||
bool can_split() const {
|
||||
// splitting a BufferSpace writing_map is too hard; don't try.
|
||||
return shared_blob->bc.writing_map.empty() && get_blob().can_split();
|
||||
}
|
||||
|
||||
void dup(Blob& o) {
|
||||
o.shared_blob = shared_blob;
|
||||
o.blob = blob;
|
||||
@ -471,6 +478,9 @@ public:
|
||||
bool put_ref(uint64_t offset, uint64_t length, uint64_t min_alloc_size,
|
||||
vector<bluestore_pextent_t> *r);
|
||||
|
||||
/// split the blob
|
||||
void split(size_t blob_offset, Blob *o);
|
||||
|
||||
void get() {
|
||||
++nref;
|
||||
}
|
||||
@ -530,6 +540,10 @@ public:
|
||||
blob_offset;
|
||||
}
|
||||
|
||||
uint32_t end() const {
|
||||
return logical_offset + length;
|
||||
}
|
||||
|
||||
bool blob_escapes_range(uint32_t o, uint32_t l) {
|
||||
uint32_t bstart = logical_offset - blob_offset;
|
||||
return (bstart < o ||
|
||||
@ -649,6 +663,8 @@ public:
|
||||
uint64_t offset, uint64_t length, uint8_t blob_depth,
|
||||
BlobRef b, extent_map_t *old_extents);
|
||||
|
||||
/// split a blob (and referring extents)
|
||||
BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
|
||||
};
|
||||
|
||||
struct OnodeSpace;
|
||||
|
@ -299,6 +299,19 @@ struct bluestore_blob_t {
|
||||
return csum_data.length() + extents.size() * 16 + 48;
|
||||
}
|
||||
|
||||
bool can_split() const {
|
||||
return
|
||||
!has_flag(FLAG_SHARED) &&
|
||||
!has_flag(FLAG_COMPRESSED) &&
|
||||
!has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex
|
||||
}
|
||||
bool can_split_at(uint32_t blob_offset) const {
  // When checksums are present a cut must land on a csum chunk boundary;
  // otherwise any offset is acceptable.
  return !has_csum() || blob_offset % get_csum_chunk_size() == 0;
}
|
||||
|
||||
void encode(bufferlist& bl) const;
|
||||
void decode(bufferlist::iterator& p);
|
||||
void dump(Formatter *f) const;
|
||||
|
@ -667,6 +667,80 @@ TEST(Blob, put_ref)
|
||||
}
|
||||
}
|
||||
|
||||
TEST(bluestore_blob_t, can_split)
{
  bluestore_blob_t b;
  // A plain mutable blob is splittable.
  b.flags = bluestore_blob_t::FLAG_MUTABLE;
  ASSERT_TRUE(b.can_split());
  // Shared, compressed, and has-unused blobs are not.
  b.flags = bluestore_blob_t::FLAG_SHARED;
  ASSERT_FALSE(b.can_split());
  b.flags = bluestore_blob_t::FLAG_COMPRESSED;
  ASSERT_FALSE(b.can_split());
  b.flags = bluestore_blob_t::FLAG_HAS_UNUSED;
  ASSERT_FALSE(b.can_split());
}
|
||||
|
||||
TEST(bluestore_blob_t, can_split_at)
{
  bluestore_blob_t b;
  b.flags = bluestore_blob_t::FLAG_MUTABLE;
  b.extents.emplace_back(bluestore_pextent_t(0x10000, 0x2000));
  b.extents.emplace_back(bluestore_pextent_t(0x20000, 0x2000));
  // Without checksums any offset is an acceptable cut point.
  ASSERT_TRUE(b.can_split_at(0x1000));
  ASSERT_TRUE(b.can_split_at(0x1800));
  // With crc32c over 0x1000-byte chunks (order 12), only chunk-aligned
  // offsets may be cut.
  b.init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x4000);
  ASSERT_TRUE(b.can_split_at(0x1000));
  ASSERT_TRUE(b.can_split_at(0x2000));
  ASSERT_TRUE(b.can_split_at(0x3000));
  ASSERT_FALSE(b.can_split_at(0x2800));
}
|
||||
|
||||
TEST(Blob, split)
{
  BlueStore::Cache *cache = BlueStore::Cache::create("lru", NULL);
  {
    // Split a single 0x2000-byte extent down the middle: each half keeps
    // one 0x1000 extent and one crc32c value (4 bytes each at order 12).
    BlueStore::Blob left, right;
    left.shared_blob = new BlueStore::SharedBlob(-1, string(), cache);
    left.shared_blob->get();  // hack to avoid dtor from running
    right.shared_blob = new BlueStore::SharedBlob(-1, string(), cache);
    right.shared_blob->get();  // hack to avoid dtor from running
    left.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x2000, 0x2000));
    left.dirty_blob().init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x2000);
    left.split(0x1000, &right);
    ASSERT_EQ(0x1000u, left.get_blob().get_logical_length());
    ASSERT_EQ(4u, left.get_blob().csum_data.length());
    ASSERT_EQ(1u, left.get_blob().extents.size());
    ASSERT_EQ(0x2000u, left.get_blob().extents.front().offset);
    ASSERT_EQ(0x1000u, left.get_blob().extents.front().length);
    ASSERT_EQ(0x1000u, right.get_blob().get_logical_length());
    ASSERT_EQ(4u, right.get_blob().csum_data.length());
    ASSERT_EQ(1u, right.get_blob().extents.size());
    ASSERT_EQ(0x3000u, right.get_blob().extents.front().offset);
    ASSERT_EQ(0x1000u, right.get_blob().extents.front().length);
  }
  {
    // Split exactly on an extent boundary: the second extent moves whole
    // to the right-hand blob.
    BlueStore::Blob left, right;
    left.shared_blob = new BlueStore::SharedBlob(-1, string(), cache);
    left.shared_blob->get();  // hack to avoid dtor from running
    right.shared_blob = new BlueStore::SharedBlob(-1, string(), cache);
    right.shared_blob->get();  // hack to avoid dtor from running
    left.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x2000, 0x1000));
    left.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x12000, 0x1000));
    left.dirty_blob().init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x2000);
    left.split(0x1000, &right);
    ASSERT_EQ(0x1000u, left.get_blob().get_logical_length());
    ASSERT_EQ(4u, left.get_blob().csum_data.length());
    ASSERT_EQ(1u, left.get_blob().extents.size());
    ASSERT_EQ(0x2000u, left.get_blob().extents.front().offset);
    ASSERT_EQ(0x1000u, left.get_blob().extents.front().length);
    ASSERT_EQ(0x1000u, right.get_blob().get_logical_length());
    ASSERT_EQ(4u, right.get_blob().csum_data.length());
    ASSERT_EQ(1u, right.get_blob().extents.size());
    ASSERT_EQ(0x12000u, right.get_blob().extents.front().offset);
    ASSERT_EQ(0x1000u, right.get_blob().extents.front().length);
  }
}
|
||||
|
||||
TEST(ExtentMap, find_lextent)
|
||||
{
|
||||
BlueStore::ExtentMap em(nullptr);
|
||||
|
Loading…
Reference in New Issue
Block a user