os/bluestore: try to split blobs instead of spanning them

If a blob crosses a shard boundary, try to split it into pieces
instead of marking it spanning.  This is only possible under
certain conditions (not shared or compressed, no in-flight writes,
no unused-space tracking, and a shard boundary that is aligned
with a csum chunk boundary).

Generally speaking, this will reduce the number of blobs that end
up spanning and thus the size of the onode key (see the sketch
below the commit metadata).

Signed-off-by: Sage Weil <sage@redhat.com>
Author: Sage Weil
Date:   2016-09-28 17:54:46 -04:00
parent 8d37c68b56
commit 1a39872b9e
4 changed files with 284 additions and 7 deletions
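A standalone sketch of the split decision, not part of the diff: FakeBlob is a hypothetical stand-in for the bluestore_blob_t flags plus the Blob-level writing_map check, and the worked numbers mirror the alignment test the new can_split_at() performs.

// Sketch, not Ceph code: when can a blob crossing a shard boundary
// be cut instead of marked spanning?
#include <cstdint>
#include <iostream>

struct FakeBlob {                    // hypothetical stand-in
  bool shared = false;               // FLAG_SHARED
  bool compressed = false;           // FLAG_COMPRESSED
  bool has_unused = false;           // FLAG_HAS_UNUSED
  bool writes_in_flight = false;     // BufferSpace::writing_map non-empty
  uint32_t csum_chunk_size = 0x1000; // 4 KiB csum chunks

  bool can_split() const {
    return !shared && !compressed && !has_unused && !writes_in_flight;
  }
  bool can_split_at(uint32_t blob_offset) const {
    return blob_offset % csum_chunk_size == 0;
  }
};

int main() {
  FakeBlob b;
  // A 0x4000-byte blob mapped at logical 0x6000 crosses the shard
  // boundary at 0x8000; the cut point within the blob is 0x2000.
  uint32_t blob_offset = 0x8000 - 0x6000;
  std::cout << std::boolalpha << b.can_split() << " "
            << b.can_split_at(blob_offset) << std::endl;  // true true
}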

@@ -1071,6 +1071,50 @@ void BlueStore::BufferSpace::finish_write(uint64_t seq)
  cache->_audit("finish_write end");
}
void BlueStore::BufferSpace::split(size_t pos, BlueStore::BufferSpace &r)
{
  // move every cached buffer at or after pos into r; a buffer that
  // straddles pos is cut in two.
  std::lock_guard<std::recursive_mutex> lk(cache->lock);
  assert(r.cache == cache);
  auto p = buffer_map.begin();
  while (p != buffer_map.end() &&
         p->second->end() <= pos) {
    dout(30) << __func__ << " skip " << *p->second << dendl;
    ++p;
  }
  if (p != buffer_map.end()) {
    if (p->second->offset < pos) {
      dout(30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(new Buffer(&r, p->second->state, p->second->seq, 0, bl),
                      0, p->second.get());
      } else {
        r._add_buffer(new Buffer(&r, p->second->state, p->second->seq, 0, right),
                      0, p->second.get());
      }
      p->second->truncate(left);
      ++p;
    }
    while (p != buffer_map.end()) {
      dout(30) << __func__ << " move " << *p->second << dendl;
      if (p->second->data.length()) {
        r._add_buffer(new Buffer(&r, p->second->state, p->second->seq,
                                 p->second->offset - pos, p->second->data),
                      0, p->second.get());
      } else {
        r._add_buffer(new Buffer(&r, p->second->state, p->second->seq,
                                 p->second->offset - pos, p->second->length),
                      0, p->second.get());
      }
      _rm_buffer(p++);
    }
  }
  assert(writing_map.empty());
}
// OnodeSpace

#undef dout_prefix
@@ -1382,6 +1426,62 @@ bool BlueStore::Blob::put_ref(
  return false;
}
void BlueStore::Blob::split(size_t blob_offset, Blob *r)
{
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " start " << *this << dendl;
  assert(blob.can_split());
  bluestore_blob_t &lb = dirty_blob();
  bluestore_blob_t &rb = r->dirty_blob();

  // carve the pextent list: everything past blob_offset moves to rb,
  // cutting the pextent that straddles the boundary if necessary.
  unsigned i = 0;
  size_t left = blob_offset;
  for (auto p = lb.extents.begin(); p != lb.extents.end(); ++p, ++i) {
    if (p->length <= left) {
      left -= p->length;
      continue;
    }
    if (left) {
      if (p->is_valid()) {
        rb.extents.emplace_back(bluestore_pextent_t(p->offset + left,
                                                    p->length - left));
      } else {
        rb.extents.emplace_back(bluestore_pextent_t(
                                  bluestore_pextent_t::INVALID_OFFSET,
                                  p->length - left));
      }
      p->length = left;
      ++i;
      ++p;
    }
    while (p != lb.extents.end()) {
      rb.extents.push_back(*p++);
    }
    lb.extents.resize(i);
    break;
  }
  rb.flags = lb.flags;

  if (lb.has_csum()) {
    rb.csum_type = lb.csum_type;
    rb.csum_chunk_order = lb.csum_chunk_order;
    size_t csum_order = lb.get_csum_chunk_size();
    assert(blob_offset % csum_order == 0);
    size_t pos = (blob_offset / csum_order) * lb.get_csum_value_size();
    // deep copy csum data
    bufferptr old;
    old.swap(lb.csum_data);
    rb.csum_data = bufferptr(old.c_str() + pos, old.length() - pos);
    lb.csum_data = bufferptr(old.c_str(), pos);
  }

  shared_blob->bc.split(blob_offset, r->shared_blob->bc);

  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " finish " << *this << dendl;
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " and " << *r << dendl;
}
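A worked sketch of the checksum-split arithmetic above, not part of the diff: crc32c stores one 4-byte value per csum chunk, so the csum_data byte position matching blob_offset is (blob_offset / chunk_size) * value_size. The numbers match the unit test below, where each half ends up with 4 csum bytes.

// Sketch, not Ceph code: where to cut csum_data.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t csum_chunk_order = 12;            // 0x1000-byte chunks
  uint32_t csum_value_size = 4;              // CSUM_CRC32C
  uint32_t chunk_size = 1u << csum_chunk_order;
  uint32_t blob_offset = 0x1000;             // split point within the blob
  assert(blob_offset % chunk_size == 0);     // enforced by can_split_at()
  uint32_t pos = (blob_offset / chunk_size) * csum_value_size;
  assert(pos == 4);  // a 0x2000 blob has 8 csum bytes: 4 stay, 4 move
}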
// Extent
@@ -1583,10 +1683,43 @@ void BlueStore::ExtentMap::reshard(Onode *o)
    }
    if (ep->blob->id < 0 &&
        ep->blob_escapes_range(shard_start, shard_end - shard_start)) {
      // We have two options: (1) split the blob into pieces at the
      // shard boundaries (and adjust extents accordingly), or (2) mark
      // it spanning.  We prefer to cut the blob if we can.  Note that
      // we may have to split it multiple times--potentially at every
      // shard boundary.
      bool must_span = false;
      BlobRef b = ep->blob;
      if (b->can_split()) {
        uint32_t bstart = ep->logical_offset - ep->blob_offset;
        uint32_t bend = bstart + b->get_blob().get_logical_length();
        for (const auto& sh : shards) {
          if (bstart < sh.offset && bend > sh.offset) {
            uint32_t blob_offset = sh.offset - bstart;
            if (b->get_blob().can_split_at(blob_offset)) {
              dout(20) << __func__ << " splitting blob, bstart 0x"
                       << std::hex << bstart
                       << " blob_offset 0x" << blob_offset
                       << std::dec << " " << *b << dendl;
              b = split_blob(b, blob_offset, sh.offset);
              // switch b to the new right-hand side, in case it
              // *also* has to get split.
              bstart += blob_offset;
            } else {
              must_span = true;
              break;
            }
          }
        }
      } else {
        must_span = true;
      }
      if (must_span) {
        b->id = bid++;
        spanning_blob_map.insert(*b);
        b->get();
        dout(20) << __func__ << " adding spanning " << *b << dendl;
      }
    }
    ++ep;
  }
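A sketch of the boundary walk above, not part of the diff: a blob may cross several shard boundaries, so after each cut the loop continues with the right-hand piece (b and bstart are advanced), which may itself cross the next boundary.

// Sketch, not Ceph code: repeated splits across shard boundaries.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  uint32_t bstart = 0x2000, bend = 0xa000;  // blob covers [0x2000, 0xa000)
  std::vector<uint32_t> shard_offsets = {0x4000, 0x8000, 0xc000};
  for (uint32_t off : shard_offsets) {
    if (bstart < off && bend > off) {       // blob straddles this boundary
      uint32_t blob_offset = off - bstart;  // cut point inside the piece
      std::cout << "split at blob_offset 0x"
                << std::hex << blob_offset << std::endl;
      bstart += blob_offset;                // keep walking the right side
    }
  }
  // prints 0x2000, then 0x4000; the third boundary is past the blob's end
}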
@@ -2026,6 +2159,48 @@ BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  return le;
}
BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec
           << " " << *lb << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    vector<bluestore_pextent_t> released;
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, ep->blob_depth, rb);
      extent_map.insert(*ne);
      lb->ref_map.put(ep->blob_offset + left, ep->length - left, &released);
      ep->length = left;
      rb->ref_map.get(ne->blob_offset, ne->length);
      dout(30) << __func__ << " split " << *ep << dendl;
      dout(30) << __func__ << " to " << *ne << dendl;
    } else {
      // switch blob
      assert(ep->blob_offset >= blob_offset);
      lb->ref_map.put(ep->blob_offset, ep->length, &released);
      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      rb->ref_map.get(ep->blob_offset, ep->length);
      dout(30) << __func__ << " adjusted " << *ep << dendl;
    }
  }
  return rb;
}
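A sketch of the lextent fixups above, not part of the diff: an extent straddling pos is cut in two, and extents wholly to the right of pos are re-pointed at the new blob with blob_offset reduced by the cut point. FakeExtent is a hypothetical stand-in for BlueStore::Extent.

// Sketch, not Ceph code: extent arithmetic in split_blob.
#include <cassert>
#include <cstdint>

struct FakeExtent {  // hypothetical stand-in
  uint32_t logical_offset, blob_offset, length;
};

int main() {
  uint32_t pos = 0x4000;        // logical split position
  uint32_t split_off = 0x2000;  // same position expressed inside the blob

  FakeExtent e{0x3000, 0x1000, 0x2000};    // straddles pos
  uint32_t left = pos - e.logical_offset;  // 0x1000 stays with the left blob
  FakeExtent ne{pos, 0, e.length - left};  // new extent on the right blob
  e.length = left;
  assert(e.length == 0x1000 && ne.length == 0x1000);

  FakeExtent f{0x5000, split_off + 0x1000, 0x800};  // wholly right of pos
  f.blob_offset -= split_off;  // now relative to the right-hand blob
  assert(f.blob_offset == 0x1000);
}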
// Onode
@@ -6029,9 +6204,8 @@ void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
      o->extent_map.reshard(o.get());
      reshard = o->extent_map.update(o.get(), t, true);
      if (reshard) {
        dout(20) << __func__ << " warning: still wants reshard, check options?"
                 << dendl;
      }
      logger->inc(l_bluestore_onode_reshard);
    }


@@ -287,6 +287,8 @@ public:
    discard(offset, (uint64_t)-1 - offset);
  }

  void split(size_t pos, BufferSpace &r);

  void dump(Formatter *f) const {
    std::lock_guard<std::recursive_mutex> l(cache->lock);
    f->open_array_section("buffers");
@@ -434,6 +436,11 @@ public:
    return id >= 0;
  }

  bool can_split() const {
    // splitting a BufferSpace writing_map is too hard; don't try.
    return shared_blob->bc.writing_map.empty() && get_blob().can_split();
  }

  void dup(Blob& o) {
    o.shared_blob = shared_blob;
    o.blob = blob;
@@ -471,6 +478,9 @@ public:
  bool put_ref(uint64_t offset, uint64_t length, uint64_t min_alloc_size,
               vector<bluestore_pextent_t> *r);

  /// split the blob
  void split(size_t blob_offset, Blob *o);

  void get() {
    ++nref;
  }
@@ -530,6 +540,10 @@ public:
      blob_offset;
  }
  uint32_t end() const {
    return logical_offset + length;
  }
  bool blob_escapes_range(uint32_t o, uint32_t l) {
    uint32_t bstart = logical_offset - blob_offset;
    return (bstart < o ||
@@ -649,6 +663,8 @@ public:
                      uint64_t offset, uint64_t length, uint8_t blob_depth,
                      BlobRef b, extent_map_t *old_extents);

  /// split a blob (and referring extents)
  BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
};

struct OnodeSpace;


@@ -299,6 +299,19 @@ struct bluestore_blob_t {
    return csum_data.length() + extents.size() * 16 + 48;
  }

  bool can_split() const {
    return
      !has_flag(FLAG_SHARED) &&
      !has_flag(FLAG_COMPRESSED) &&
      !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex
  }
  bool can_split_at(uint32_t blob_offset) const {
    if (has_csum() &&
        blob_offset % get_csum_chunk_size() != 0)
      return false;
    return true;
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& p);
  void dump(Formatter *f) const;


@@ -667,6 +667,80 @@ TEST(Blob, put_ref)
  }
}
TEST(bluestore_blob_t, can_split)
{
  bluestore_blob_t a;
  a.flags = bluestore_blob_t::FLAG_MUTABLE;
  ASSERT_TRUE(a.can_split());
  a.flags = bluestore_blob_t::FLAG_SHARED;
  ASSERT_FALSE(a.can_split());
  a.flags = bluestore_blob_t::FLAG_COMPRESSED;
  ASSERT_FALSE(a.can_split());
  a.flags = bluestore_blob_t::FLAG_HAS_UNUSED;
  ASSERT_FALSE(a.can_split());
}

TEST(bluestore_blob_t, can_split_at)
{
  bluestore_blob_t a;
  a.flags = bluestore_blob_t::FLAG_MUTABLE;
  a.extents.emplace_back(bluestore_pextent_t(0x10000, 0x2000));
  a.extents.emplace_back(bluestore_pextent_t(0x20000, 0x2000));
  ASSERT_TRUE(a.can_split_at(0x1000));
  ASSERT_TRUE(a.can_split_at(0x1800));
  a.init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x4000);
  ASSERT_TRUE(a.can_split_at(0x1000));
  ASSERT_TRUE(a.can_split_at(0x2000));
  ASSERT_TRUE(a.can_split_at(0x3000));
  ASSERT_FALSE(a.can_split_at(0x2800));
}

TEST(Blob, split)
{
  BlueStore::Cache *cache = BlueStore::Cache::create("lru", NULL);
  {
    BlueStore::Blob L, R;
    L.shared_blob = new BlueStore::SharedBlob(-1, string(), cache);
    L.shared_blob->get();  // hack to avoid dtor from running
    R.shared_blob = new BlueStore::SharedBlob(-1, string(), cache);
    R.shared_blob->get();  // hack to avoid dtor from running
    L.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x2000, 0x2000));
    L.dirty_blob().init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x2000);
    L.split(0x1000, &R);
    ASSERT_EQ(0x1000u, L.get_blob().get_logical_length());
    ASSERT_EQ(4u, L.get_blob().csum_data.length());
    ASSERT_EQ(1u, L.get_blob().extents.size());
    ASSERT_EQ(0x2000u, L.get_blob().extents.front().offset);
    ASSERT_EQ(0x1000u, L.get_blob().extents.front().length);
    ASSERT_EQ(0x1000u, R.get_blob().get_logical_length());
    ASSERT_EQ(4u, R.get_blob().csum_data.length());
    ASSERT_EQ(1u, R.get_blob().extents.size());
    ASSERT_EQ(0x3000u, R.get_blob().extents.front().offset);
    ASSERT_EQ(0x1000u, R.get_blob().extents.front().length);
  }
  {
    BlueStore::Blob L, R;
    L.shared_blob = new BlueStore::SharedBlob(-1, string(), cache);
    L.shared_blob->get();  // hack to avoid dtor from running
    R.shared_blob = new BlueStore::SharedBlob(-1, string(), cache);
    R.shared_blob->get();  // hack to avoid dtor from running
    L.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x2000, 0x1000));
    L.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x12000, 0x1000));
    L.dirty_blob().init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x2000);
    L.split(0x1000, &R);
    ASSERT_EQ(0x1000u, L.get_blob().get_logical_length());
    ASSERT_EQ(4u, L.get_blob().csum_data.length());
    ASSERT_EQ(1u, L.get_blob().extents.size());
    ASSERT_EQ(0x2000u, L.get_blob().extents.front().offset);
    ASSERT_EQ(0x1000u, L.get_blob().extents.front().length);
    ASSERT_EQ(0x1000u, R.get_blob().get_logical_length());
    ASSERT_EQ(4u, R.get_blob().csum_data.length());
    ASSERT_EQ(1u, R.get_blob().extents.size());
    ASSERT_EQ(0x12000u, R.get_blob().extents.front().offset);
    ASSERT_EQ(0x1000u, R.get_blob().extents.front().length);
  }
}
TEST(ExtentMap, find_lextent)
{
  BlueStore::ExtentMap em(nullptr);