os/bluestore/BlueStore.cc: merge overlapping/adjacent regions before read

Fixes: http://tracker.ceph.com/issues/36625
Signed-off-by: Yang Honggang <yanghonggang@umcloud.com>
Author: Yang Honggang   Date: 2018-10-30 06:54:39 +00:00
parent 039e29b5dd
commit a86d560635
2 changed files with 168 additions and 43 deletions
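
The heart of the change is the chunk-align-then-merge step added to _do_read. The sketch below restates that policy as a standalone program so it can be read and run outside BlueStore; read_req, merge_read and the fixed 4 KiB chunk size are illustrative stand-ins rather than BlueStore names, and the real code additionally records a per-region front so the merged buffer can be split apart again afterwards.

  // Illustrative sketch only; read_req/merge_read do not exist in BlueStore.
  #include <cstdint>
  #include <iostream>
  #include <list>

  struct read_req {
    uint64_t r_off;              // chunk-aligned offset of the merged read
    uint64_t r_len;              // chunk-aligned length of the merged read
    std::list<uint64_t> fronts;  // where each caller's region starts inside it
  };

  static void merge_read(std::list<read_req>& reqs, uint64_t b_off, uint64_t len,
                         uint64_t chunk_size) {
    uint64_t front = b_off % chunk_size;   // padding in front of the payload
    uint64_t r_off = b_off - front;        // round down to a chunk boundary
    uint64_t r_len = len + front;
    uint64_t tail = r_len % chunk_size;
    if (tail)
      r_len += chunk_size - tail;          // round up to a chunk boundary
    if (!reqs.empty() && r_off <= reqs.back().r_off + reqs.back().r_len) {
      // Adjacent or overlapping with the previous request: grow it instead of
      // issuing a second disk read.
      read_req& pre = reqs.back();
      front += r_off - pre.r_off;
      pre.r_len += (r_off + r_len) - (pre.r_off + pre.r_len);
      pre.fronts.push_back(front);
    } else {
      reqs.push_back(read_req{r_off, r_len, {front}});
    }
  }

  int main() {
    std::list<read_req> reqs;
    const uint64_t chunk = 0x1000;       // assumed 4 KiB chunk size
    merge_read(reqs, 0x5, 5, chunk);     // aligns to <0x0, 0x1000>
    merge_read(reqs, 0x1002, 5, chunk);  // aligns to <0x1000, 0x1000>, touches -> merged
    for (const auto& r : reqs)
      std::cout << "<0x" << std::hex << r.r_off << ", 0x" << r.r_len << ">\n";
    // prints a single merged request: <0x0, 0x2000>
    return 0;
  }

Merging only ever has to look at the previous request because _do_read generates regions in increasing blob-offset order.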

src/os/bluestore/BlueStore.cc

@@ -7722,20 +7722,20 @@ struct region_t {
   uint64_t logical_offset;
   uint64_t blob_xoffset;   //region offset within the blob
   uint64_t length;
-  bufferlist bl;
 
   // used later in read process
   uint64_t front = 0;
-  uint64_t r_off = 0;
 
-  region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
+  region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
     : logical_offset(offset),
      blob_xoffset(b_offs),
-     length(len){}
+     length(len),
+     front(front){}
   region_t(const region_t& from)
     : logical_offset(from.logical_offset),
      blob_xoffset(from.blob_xoffset),
-     length(from.length){}
+     length(from.length),
+     front(from.front){}
 
   friend ostream& operator<<(ostream& out, const region_t& r) {
     return out << "0x" << std::hex << r.logical_offset << ":"
@@ -7743,7 +7743,24 @@ struct region_t {
   }
 };
 
-typedef list<region_t> regions2read_t;
+// merged blob read request
+struct read_req_t {
+  uint64_t r_off = 0;
+  uint64_t r_len = 0;
+  bufferlist bl;
+  std::list<region_t> regs; // original read regions
+
+  read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
+
+  friend ostream& operator<<(ostream& out, const read_req_t& r) {
+    out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
+    for (const auto& reg : r.regs)
+      out << reg;
+    return out << "]}" << std::dec;
+  }
+};
+typedef list<read_req_t> regions2read_t;
 typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
 
 int BlueStore::_do_read(
@@ -7832,6 +7849,7 @@ int BlueStore::_do_read(
               << std::dec << dendl;
 
       auto pc = cache_res.begin();
+      uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
       while (b_len > 0) {
         unsigned l;
         if (pc != cache_res.end() &&
@@ -7849,7 +7867,36 @@ int BlueStore::_do_read(
        }
        dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
-       blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
+       // merge regions
+       {
+         uint64_t r_off = b_off;
+         uint64_t r_len = l;
+         uint64_t front = r_off % chunk_size;
+         if (front) {
+           r_off -= front;
+           r_len += front;
+         }
+         unsigned tail = r_len % chunk_size;
+         if (tail) {
+           r_len += chunk_size - tail;
+         }
+         bool merged = false;
+         regions2read_t& r2r = blobs2read[bptr];
+         if (r2r.size()) {
+           read_req_t& pre = r2r.back();
+           if (r_off <= (pre.r_off + pre.r_len)) {
+             front += (r_off - pre.r_off);
+             pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
+             pre.regs.emplace_back(region_t(pos, b_off, l, front));
+             merged = true;
+           }
+         }
+         if (!merged) {
+           read_req_t req(r_off, r_len);
+           req.regs.emplace_back(region_t(pos, b_off, l, front));
+           r2r.emplace_back(std::move(req));
+         }
+       }
        ++num_regions;
      }
      pos += l;
@@ -7868,8 +7915,9 @@ int BlueStore::_do_read(
   IOContext ioc(cct, NULL, true); // allow EIO
   for (auto& p : blobs2read) {
     const BlobRef& bptr = p.first;
+    regions2read_t& r2r = p.second;
     dout(20) << __func__ << " blob " << *bptr << std::hex
-             << " need " << p.second << std::dec << dendl;
+             << " need " << r2r << std::dec << dendl;
     if (bptr->get_blob().is_compressed()) {
       // read the whole thing
       if (compressed_blob_bls.empty()) {
@@ -7883,7 +7931,7 @@ int BlueStore::_do_read(
        [&](uint64_t offset, uint64_t length) {
          int r;
          // use aio if there are more regions to read than those in this blob
-         if (num_regions > p.second.size()) {
+         if (num_regions > r2r.size()) {
            r = bdev->aio_read(offset, length, &bl, &ioc);
          } else {
            r = bdev->read(offset, length, &bl, &ioc, false);
@@ -7902,36 +7950,24 @@ int BlueStore::_do_read(
      }
    } else {
      // read the pieces
-     for (auto& reg : p.second) {
-       // determine how much of the blob to read
-       uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
-       reg.r_off = reg.blob_xoffset;
-       uint64_t r_len = reg.length;
-       reg.front = reg.r_off % chunk_size;
-       if (reg.front) {
-         reg.r_off -= reg.front;
-         r_len += reg.front;
-       }
-       unsigned tail = r_len % chunk_size;
-       if (tail) {
-         r_len += chunk_size - tail;
-       }
+     for (auto& req : r2r) {
        dout(20) << __func__ << " region 0x" << std::hex
-                << reg.logical_offset
-                << ": 0x" << reg.blob_xoffset << "~" << reg.length
-                << " reading 0x" << reg.r_off << "~" << r_len << std::dec
+                << req.regs.front().logical_offset
+                << ": 0x" << req.regs.front().blob_xoffset
+                << " reading 0x" << req.r_off
+                << "~" << req.r_len << std::dec
                 << dendl;
 
        // read it
        r = bptr->get_blob().map(
-         reg.r_off, r_len,
+         req.r_off, req.r_len,
          [&](uint64_t offset, uint64_t length) {
            int r;
            // use aio if there is more than one region to read
            if (num_regions > 1) {
-             r = bdev->aio_read(offset, length, &reg.bl, &ioc);
+             r = bdev->aio_read(offset, length, &req.bl, &ioc);
            } else {
-             r = bdev->read(offset, length, &reg.bl, &ioc, false);
+             r = bdev->read(offset, length, &req.bl, &ioc, false);
            }
            if (r < 0)
              return r;
@@ -7946,7 +7982,7 @@ int BlueStore::_do_read(
          }
          ceph_assert(r == 0);
        }
-       ceph_assert(reg.bl.length() == r_len);
+       ceph_assert(req.bl.length() == req.r_len);
      }
    }
  }
@@ -7967,16 +8003,18 @@ int BlueStore::_do_read(
   blobs2read_t::iterator b2r_it = blobs2read.begin();
   while (b2r_it != blobs2read.end()) {
     const BlobRef& bptr = b2r_it->first;
+    regions2read_t& r2r = b2r_it->second;
     dout(20) << __func__ << " blob " << *bptr << std::hex
-             << " need 0x" << b2r_it->second << std::dec << dendl;
+             << " need 0x" << r2r << std::dec << dendl;
     if (bptr->get_blob().is_compressed()) {
       ceph_assert(p != compressed_blob_bls.end());
       bufferlist& compressed_bl = *p++;
       if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
-                       b2r_it->second.front().logical_offset) < 0) {
+                       r2r.front().regs.front().logical_offset) < 0) {
        // Handles spurious read errors caused by a kernel bug.
        // We sometimes get all-zero pages as a result of the read under
-       // high memory pressure. Retrying the failing read succeeds in most cases.
+       // high memory pressure. Retrying the failing read succeeds in most
+       // cases.
        // See also: http://tracker.ceph.com/issues/22464
        if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
          return -EIO;
@@ -7991,17 +8029,20 @@ int BlueStore::_do_read(
        bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
                                       raw_bl);
      }
-     for (auto& i : b2r_it->second) {
-       ready_regions[i.logical_offset].substr_of(
-         raw_bl, i.blob_xoffset, i.length);
+     for (auto& req : r2r) {
+       for (auto& r : req.regs) {
+         ready_regions[r.logical_offset].substr_of(
+           raw_bl, r.blob_xoffset, r.length);
+       }
      }
    } else {
-     for (auto& reg : b2r_it->second) {
-       if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
-                        reg.logical_offset) < 0) {
+     for (auto& req : r2r) {
+       if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
+                        req.regs.front().logical_offset) < 0) {
         // Handles spurious read errors caused by a kernel bug.
         // We sometimes get all-zero pages as a result of the read under
-        // high memory pressure. Retrying the failing read succeeds in most cases.
+        // high memory pressure. Retrying the failing read succeeds in most
+        // cases.
         // See also: http://tracker.ceph.com/issues/22464
         if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
           return -EIO;
@@ -8010,12 +8051,13 @@ int BlueStore::_do_read(
        }
        if (buffered) {
          bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
-                                        reg.r_off, reg.bl);
+                                        req.r_off, req.bl);
        }
 
        // prune and keep result
-       ready_regions[reg.logical_offset].substr_of(
-         reg.bl, reg.front, reg.length);
+       for (const auto& r : req.regs) {
+         ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
+       }
      }
    }
    ++b2r_it;
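
To make the prune-and-keep step concrete (numbers are illustrative, assuming a 4 KiB chunk size): two logical reads at blob offsets 0x5~5 and 0x1002~5 align to <0x0, 0x1000> and <0x1000, 0x1000> and therefore end up in one read_req_t with r_off = 0x0 and r_len = 0x2000. Their region_t::front values are 0x5 and 0x1002, so once the single disk read has filled req.bl, the loop above recovers the first caller-visible region with substr_of(req.bl, 0x5, 5) and the second with substr_of(req.bl, 0x1002, 5).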

src/test/objectstore/store_test.cc

@@ -7339,6 +7339,89 @@ TEST_P(StoreTest, allocateBlueFSTest) {
   ASSERT_EQ(r, 0);
 }
 
+TEST_P(StoreTest, mergeRegionTest) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  SetVal(g_conf(), "bluestore_fsck_on_mount", "true");
+  SetVal(g_conf(), "bluestore_fsck_on_umount", "true");
+  SetVal(g_conf(), "bdev_debug_inflight_ios", "true");
+  g_ceph_context->_conf.apply_changes(nullptr);
+
+  uint32_t chunk_size = g_ceph_context->_conf->bdev_block_size;
+  int r = -1;
+  coll_t cid;
+  ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP)));
+  auto ch = store->create_new_collection(cid);
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.touch(cid, hoid);
+    cerr << "Creating object " << hoid << std::endl;
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+
+  bufferlist bl5;
+  bl5.append("abcde");
+  uint64_t offset = 0;
+  { // 1. same region
+    ObjectStore::Transaction t;
+    t.write(cid, hoid, offset, 5, bl5);
+    t.write(cid, hoid, 0xa + offset, 5, bl5);
+    t.write(cid, hoid, 0x14 + offset, 5, bl5);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  { // 2. adjacent regions
+    ObjectStore::Transaction t;
+    offset = chunk_size;
+    t.write(cid, hoid, offset, 5, bl5);
+    t.write(cid, hoid, offset + chunk_size + 3, 5, bl5);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  { // 3. front merge
+    ObjectStore::Transaction t;
+    offset = chunk_size * 2;
+    t.write(cid, hoid, offset, 5, bl5);
+    t.write(cid, hoid, offset + chunk_size - 2, 5, bl5);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  { // 4. back merge
+    ObjectStore::Transaction t;
+    bufferlist blc2;
+    blc2.append_zero(chunk_size + 2);
+
+    offset = chunk_size * 3;
+    t.write(cid, hoid, offset, chunk_size + 2, blc2);
+    t.write(cid, hoid, offset + chunk_size + 3, 5, bl5);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  { // 5. overlapping
+    ObjectStore::Transaction t;
+    uint64_t final_len = 0;
+    offset = chunk_size * 10;
+    bufferlist bl2c2;
+    bl2c2.append_zero(chunk_size * 2);
+
+    t.write(cid, hoid, offset + chunk_size * 3 - 3, chunk_size * 2, bl2c2);
+    bl2c2.append_zero(2);
+    t.write(cid, hoid, offset + chunk_size - 2, chunk_size * 2 + 2, bl2c2);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+
+    final_len = (offset + chunk_size * 3 - 3) + (chunk_size * 2);
+    bufferlist bl;
+    r = store->read(ch, hoid, 0, final_len, bl);
+    ASSERT_EQ(r, final_len);
+  }
+}
+
 #endif // WITH_BLUESTORE
 
 int main(int argc, char **argv) {
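
For orientation on the overlapping case in the new test (assuming the default 4 KiB bdev_block_size, so chunk_size = 0x1000): with offset = 0xa000 the first write covers [0xcffd, 0xeffd) and the second [0xaffe, 0xd000), so the two extents overlap around the 0xd000 chunk boundary. final_len works out to 0xcffd + 0x2000 = 0xeffd, and the closing read of all 0xeffd bytes drives _do_read across those extents, exercising the merge path end to end.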