os/bluestore/BlueStore.cc: merge overlapping/adjacent regions before read
Fixes: http://tracker.ceph.com/issues/36625
Signed-off-by: Yang Honggang <yanghonggang@umcloud.com>
parent 039e29b5dd
commit a86d560635
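In short: instead of issuing one device read per requested region, _do_read() now rounds each region of a blob out to chunk boundaries and folds it into the previous read request for that blob whenever the rounded ranges overlap or touch; every original region records a front offset so its bytes can later be carved out of the shared, merged buffer. Before the diff, here is a self-contained sketch of that idea using simplified, hypothetical types (Region, ReadReq, merge_regions); it is an illustration only, not the BlueStore code, which follows below.

// merge_sketch.cc -- simplified illustration of chunk-aligned region merging.
// Region/ReadReq are stand-ins for BlueStore's region_t/read_req_t.
#include <cstdint>
#include <iostream>
#include <list>
#include <utility>
#include <vector>

struct Region {              // one extent the caller asked for
  uint64_t logical_offset;   // offset within the object
  uint64_t blob_xoffset;     // offset within the blob
  uint64_t length;
  uint64_t front;            // where this region starts inside the merged buffer
};

struct ReadReq {             // one chunk-aligned disk read covering >= 1 regions
  uint64_t r_off = 0;
  uint64_t r_len = 0;
  std::list<Region> regs;
};

// Regions are assumed sorted by blob_xoffset and non-overlapping, as they are
// when _do_read() walks the logical extents in order.
std::list<ReadReq> merge_regions(const std::vector<Region>& in,
                                 uint64_t chunk_size) {
  std::list<ReadReq> out;
  for (Region r : in) {
    uint64_t r_off = r.blob_xoffset;
    uint64_t r_len = r.length;
    uint64_t front = r_off % chunk_size;     // round the start down to a chunk
    r_off -= front;
    r_len += front;
    if (uint64_t tail = r_len % chunk_size)  // round the end up to a chunk
      r_len += chunk_size - tail;
    if (!out.empty() && r_off <= out.back().r_off + out.back().r_len) {
      // overlaps or touches the previous request: extend it instead of
      // issuing another read
      ReadReq& pre = out.back();
      r.front = front + (r_off - pre.r_off);
      pre.r_len += (r_off + r_len) - (pre.r_off + pre.r_len);
      pre.regs.push_back(r);
    } else {
      ReadReq req;
      req.r_off = r_off;
      req.r_len = r_len;
      r.front = front;
      req.regs.push_back(r);
      out.push_back(std::move(req));
    }
  }
  return out;
}

int main() {
  // Three small regions in the first two 4 KiB chunks of a blob collapse
  // into a single 8 KiB read.
  std::vector<Region> regs = {{0x0, 0x0, 5}, {0xa, 0xa, 5}, {0xffe, 0xffe, 5}};
  for (const auto& req : merge_regions(regs, 0x1000))
    std::cout << "read 0x" << std::hex << req.r_off << "~0x" << req.r_len
              << " serves " << std::dec << req.regs.size() << " regions\n";
}

With a 4 KiB chunk size, the three regions in main() above end up in a single <0x0, 0x2000> request instead of three separate device reads.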
os/bluestore/BlueStore.cc
@@ -7722,20 +7722,20 @@ struct region_t {
   uint64_t logical_offset;
   uint64_t blob_xoffset;   //region offset within the blob
   uint64_t length;
-  bufferlist bl;
 
   // used later in read process
   uint64_t front = 0;
-  uint64_t r_off = 0;
 
-  region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
+  region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
     : logical_offset(offset),
      blob_xoffset(b_offs),
-     length(len){}
+     length(len),
+     front(front){}
   region_t(const region_t& from)
     : logical_offset(from.logical_offset),
      blob_xoffset(from.blob_xoffset),
-     length(from.length){}
+     length(from.length),
+     front(from.front){}
 
   friend ostream& operator<<(ostream& out, const region_t& r) {
     return out << "0x" << std::hex << r.logical_offset << ":"
@@ -7743,7 +7743,24 @@ struct region_t {
   }
 };
 
-typedef list<region_t> regions2read_t;
+// merged blob read request
+struct read_req_t {
+  uint64_t r_off = 0;
+  uint64_t r_len = 0;
+  bufferlist bl;
+  std::list<region_t> regs; // original read regions
+
+  read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
+
+  friend ostream& operator<<(ostream& out, const read_req_t& r) {
+    out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
+    for (const auto& reg : r.regs)
+      out << reg;
+    return out << "]}" << std::dec;
+  }
+};
+
+typedef list<read_req_t> regions2read_t;
 typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
 
 int BlueStore::_do_read(
@@ -7832,6 +7849,7 @@ int BlueStore::_do_read(
             << std::dec << dendl;
 
     auto pc = cache_res.begin();
+    uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
     while (b_len > 0) {
       unsigned l;
       if (pc != cache_res.end() &&
@@ -7849,7 +7867,36 @@ int BlueStore::_do_read(
       }
       dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
                << b_off << "~" << l << std::dec << dendl;
-      blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
+      // merge regions
+      {
+        uint64_t r_off = b_off;
+        uint64_t r_len = l;
+        uint64_t front = r_off % chunk_size;
+        if (front) {
+          r_off -= front;
+          r_len += front;
+        }
+        unsigned tail = r_len % chunk_size;
+        if (tail) {
+          r_len += chunk_size - tail;
+        }
+        bool merged = false;
+        regions2read_t& r2r = blobs2read[bptr];
+        if (r2r.size()) {
+          read_req_t& pre = r2r.back();
+          if (r_off <= (pre.r_off + pre.r_len)) {
+            front += (r_off - pre.r_off);
+            pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
+            pre.regs.emplace_back(region_t(pos, b_off, l, front));
+            merged = true;
+          }
+        }
+        if (!merged) {
+          read_req_t req(r_off, r_len);
+          req.regs.emplace_back(region_t(pos, b_off, l, front));
+          r2r.emplace_back(std::move(req));
+        }
+      }
       ++num_regions;
     }
     pos += l;
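To make the rounding concrete (illustrative numbers only, chunk_size assumed to be 0x1000): a first region at blob offset 0x1000 with length 5 gives front = 0 and the request <0x1000, 0x1000>. A second region at blob offset 0x2003 with length 5 gives front = 3, r_off = 0x2000 and r_len = 0x1000; since 0x2000 <= 0x1000 + 0x1000, it is folded into the previous request, whose r_len grows to 0x2000, and the region is recorded with front = 3 + (0x2000 - 0x1000) = 0x1003. A single chunk-aligned read of 0x2000 bytes now serves both regions.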
@@ -7868,8 +7915,9 @@ int BlueStore::_do_read(
   IOContext ioc(cct, NULL, true); // allow EIO
   for (auto& p : blobs2read) {
     const BlobRef& bptr = p.first;
+    regions2read_t& r2r = p.second;
     dout(20) << __func__ << " blob " << *bptr << std::hex
-             << " need " << p.second << std::dec << dendl;
+             << " need " << r2r << std::dec << dendl;
     if (bptr->get_blob().is_compressed()) {
       // read the whole thing
       if (compressed_blob_bls.empty()) {
@@ -7883,7 +7931,7 @@ int BlueStore::_do_read(
         [&](uint64_t offset, uint64_t length) {
           int r;
           // use aio if there are more regions to read than those in this blob
-          if (num_regions > p.second.size()) {
+          if (num_regions > r2r.size()) {
             r = bdev->aio_read(offset, length, &bl, &ioc);
           } else {
             r = bdev->read(offset, length, &bl, &ioc, false);
@@ -7902,36 +7950,24 @@ int BlueStore::_do_read(
       }
     } else {
       // read the pieces
-      for (auto& reg : p.second) {
-        // determine how much of the blob to read
-        uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
-        reg.r_off = reg.blob_xoffset;
-        uint64_t r_len = reg.length;
-        reg.front = reg.r_off % chunk_size;
-        if (reg.front) {
-          reg.r_off -= reg.front;
-          r_len += reg.front;
-        }
-        unsigned tail = r_len % chunk_size;
-        if (tail) {
-          r_len += chunk_size - tail;
-        }
+      for (auto& req : r2r) {
         dout(20) << __func__ << " region 0x" << std::hex
-                 << reg.logical_offset
-                 << ": 0x" << reg.blob_xoffset << "~" << reg.length
-                 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
+                 << req.regs.front().logical_offset
+                 << ": 0x" << req.regs.front().blob_xoffset
+                 << " reading 0x" << req.r_off
+                 << "~" << req.r_len << std::dec
                  << dendl;
 
         // read it
         r = bptr->get_blob().map(
-          reg.r_off, r_len,
+          req.r_off, req.r_len,
          [&](uint64_t offset, uint64_t length) {
            int r;
            // use aio if there is more than one region to read
            if (num_regions > 1) {
-             r = bdev->aio_read(offset, length, &reg.bl, &ioc);
+             r = bdev->aio_read(offset, length, &req.bl, &ioc);
            } else {
-             r = bdev->read(offset, length, &reg.bl, &ioc, false);
+             r = bdev->read(offset, length, &req.bl, &ioc, false);
            }
            if (r < 0)
              return r;
@@ -7946,7 +7982,7 @@ int BlueStore::_do_read(
          }
          ceph_assert(r == 0);
        }
-       ceph_assert(reg.bl.length() == r_len);
+       ceph_assert(req.bl.length() == req.r_len);
      }
    }
  }
@@ -7967,16 +8003,18 @@ int BlueStore::_do_read(
   blobs2read_t::iterator b2r_it = blobs2read.begin();
   while (b2r_it != blobs2read.end()) {
     const BlobRef& bptr = b2r_it->first;
+    regions2read_t& r2r = b2r_it->second;
     dout(20) << __func__ << " blob " << *bptr << std::hex
-             << " need 0x" << b2r_it->second << std::dec << dendl;
+             << " need 0x" << r2r << std::dec << dendl;
     if (bptr->get_blob().is_compressed()) {
       ceph_assert(p != compressed_blob_bls.end());
       bufferlist& compressed_bl = *p++;
       if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
-                       b2r_it->second.front().logical_offset) < 0) {
+                       r2r.front().regs.front().logical_offset) < 0) {
         // Handles spurious read errors caused by a kernel bug.
         // We sometimes get all-zero pages as a result of the read under
-        // high memory pressure. Retrying the failing read succeeds in most cases.
+        // high memory pressure. Retrying the failing read succeeds in most
+        // cases.
         // See also: http://tracker.ceph.com/issues/22464
         if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
           return -EIO;
@@ -7991,17 +8029,20 @@ int BlueStore::_do_read(
         bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
                                        raw_bl);
       }
-      for (auto& i : b2r_it->second) {
-        ready_regions[i.logical_offset].substr_of(
-          raw_bl, i.blob_xoffset, i.length);
+      for (auto& req : r2r) {
+        for (auto& r : req.regs) {
+          ready_regions[r.logical_offset].substr_of(
+            raw_bl, r.blob_xoffset, r.length);
+        }
       }
     } else {
-      for (auto& reg : b2r_it->second) {
-        if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
-                         reg.logical_offset) < 0) {
+      for (auto& req : r2r) {
+        if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
+                         req.regs.front().logical_offset) < 0) {
          // Handles spurious read errors caused by a kernel bug.
          // We sometimes get all-zero pages as a result of the read under
-         // high memory pressure. Retrying the failing read succeeds in most cases.
+         // high memory pressure. Retrying the failing read succeeds in most
+         // cases.
          // See also: http://tracker.ceph.com/issues/22464
          if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
            return -EIO;
@@ -8010,12 +8051,13 @@ int BlueStore::_do_read(
          }
          if (buffered) {
            bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
-                                          reg.r_off, reg.bl);
+                                          req.r_off, req.bl);
          }
 
          // prune and keep result
-         ready_regions[reg.logical_offset].substr_of(
-           reg.bl, reg.front, reg.length);
+         for (const auto& r : req.regs) {
+           ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
+         }
        }
      }
     ++b2r_it;
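Once a merged request has been read back (and its checksum verified), the loop above slices each original region out of the shared buffer at its recorded front offset. A minimal stand-alone sketch of that unpacking step, using std::string in place of ceph::bufferlist and hypothetical names:

// unpack_sketch.cc -- how one merged buffer is split back into per-region
// results; Region/ReadReq mirror the simplified types from the earlier sketch.
#include <cstdint>
#include <list>
#include <map>
#include <string>

struct Region { uint64_t logical_offset, length, front; };
struct ReadReq { std::string bl; std::list<Region> regs; };

// Equivalent in spirit to the ready_regions[...].substr_of(req.bl, r.front,
// r.length) loop in _do_read(): each region gets only the slice it asked for.
void unpack(const ReadReq& req,
            std::map<uint64_t, std::string>& ready_regions) {
  for (const auto& r : req.regs)
    ready_regions[r.logical_offset] = req.bl.substr(r.front, r.length);
}

The remaining hunk adds a StoreTest case, mergeRegionTest, to the objectstore test suite; it writes patterns covering the same-region, adjacent, front-merge, back-merge and overlapping cases and reads them back.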
@@ -7339,6 +7339,89 @@ TEST_P(StoreTest, allocateBlueFSTest) {
   ASSERT_EQ(r, 0);
 }
 
+TEST_P(StoreTest, mergeRegionTest) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  SetVal(g_conf(), "bluestore_fsck_on_mount", "true");
+  SetVal(g_conf(), "bluestore_fsck_on_umount", "true");
+  SetVal(g_conf(), "bdev_debug_inflight_ios", "true");
+  g_ceph_context->_conf.apply_changes(nullptr);
+
+  uint32_t chunk_size = g_ceph_context->_conf->bdev_block_size;
+  int r = -1;
+  coll_t cid;
+  ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP)));
+  auto ch = store->create_new_collection(cid);
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.touch(cid, hoid);
+    cerr << "Creating object " << hoid << std::endl;
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  bufferlist bl5;
+  bl5.append("abcde");
+  uint64_t offset = 0;
+  { // 1. same region
+    ObjectStore::Transaction t;
+    t.write(cid, hoid, offset, 5, bl5);
+    t.write(cid, hoid, 0xa + offset, 5, bl5);
+    t.write(cid, hoid, 0x14 + offset, 5, bl5);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  { // 2. adjacent regions
+    ObjectStore::Transaction t;
+    offset = chunk_size;
+    t.write(cid, hoid, offset, 5, bl5);
+    t.write(cid, hoid, offset + chunk_size + 3, 5, bl5);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  { // 3. front merge
+    ObjectStore::Transaction t;
+    offset = chunk_size * 2;
+    t.write(cid, hoid, offset, 5, bl5);
+    t.write(cid, hoid, offset + chunk_size - 2, 5, bl5);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  { // 4. back merge
+    ObjectStore::Transaction t;
+    bufferlist blc2;
+    blc2.append_zero(chunk_size + 2);
+
+    offset = chunk_size * 3;
+    t.write(cid, hoid, offset, chunk_size + 2, blc2);
+    t.write(cid, hoid, offset + chunk_size + 3, 5, bl5);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  { // 5. overlapping
+    ObjectStore::Transaction t;
+    uint64_t final_len = 0;
+    offset = chunk_size * 10;
+    bufferlist bl2c2;
+    bl2c2.append_zero(chunk_size * 2);
+    t.write(cid, hoid, offset + chunk_size * 3 - 3, chunk_size * 2, bl2c2);
+    bl2c2.append_zero(2);
+    t.write(cid, hoid, offset + chunk_size - 2, chunk_size * 2 + 2, bl2c2);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+
+    final_len = (offset + chunk_size * 3 - 3) + (chunk_size * 2);
+    bufferlist bl;
+    r = store->read(ch, hoid, 0, final_len, bl);
+    ASSERT_EQ(r, final_len);
+  }
+}
 #endif // WITH_BLUESTORE
 
 int main(int argc, char **argv) {
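To run only the new case, the standard gtest filter on the objectstore test binary should work; the exact path depends on the build tree, but something along the lines of:

./bin/ceph_test_objectstore --gtest_filter='*mergeRegionTest*'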