Merge pull request #27268 from trociny/wip-38364

librbd: support EC data pool images sparsify

Reviewed-by: Jason Dillaman <dillaman@redhat.com>
This commit is contained in:
Jason Dillaman 2019-04-09 08:57:53 -04:00 committed by GitHub
commit f42d3ffb31
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 252 additions and 76 deletions

View File

@ -5,6 +5,7 @@
#include "cls/rbd/cls_rbd_client.h"
#include "common/dout.h"
#include "common/errno.h"
#include "include/err.h"
#include "librbd/AsyncObjectThrottle.h"
#include "librbd/ExclusiveLock.h"
#include "librbd/ImageCtx.h"
@ -19,6 +20,58 @@
namespace librbd {
namespace operation {
namespace {
/**
 * Decide whether the tail of an object holds nothing but zeroes and can
 * therefore be cut off.
 *
 * The extents are walked from the highest offset downwards, and each extent
 * is examined in chunks that end on sparse_size boundaries. Walking stops at
 * the first chunk that contains a non-zero byte.
 *
 * @param extent_map  offset -> length map of the extents that hold data
 * @param bl          data of those extents; it is consumed from its end
 *                    backwards, one extent after another, so it is expected
 *                    to hold the extents' bytes back to back in ascending
 *                    offset order
 * @param sparse_size trim granularity in bytes; must be non-zero
 *                    (NOTE(review): p2roundup is applied to it, so presumably
 *                    a power of two -- confirm against the caller)
 * @param new_end_ptr out: offset the object may be truncated to; written only
 *                    when true is returned
 * @return true when the object can be shortened (new end below the current
 *         end of data, or no data at all), false otherwise
 */
bool may_be_trimmed(const std::map<uint64_t,uint64_t> &extent_map,
                    const bufferlist &bl, size_t sparse_size,
                    uint64_t *new_end_ptr) {
  if (extent_map.empty()) {
    // no data extents at all: the whole object is trimmable
    *new_end_ptr = 0;
    return true;
  }

  uint64_t end = extent_map.rbegin()->first + extent_map.rbegin()->second;
  uint64_t new_end = end;
  uint64_t bl_off = bl.length();

  for (auto it = extent_map.rbegin(); it != extent_map.rend(); ++it) {
    const uint64_t off = it->first;
    const uint64_t len = it->second;

    // candidate end if this extent turns out to hold data in its last block
    new_end = p2roundup<uint64_t>(off + len, sparse_size);

    uint64_t extent_left = len;
    while (extent_left > 0) {
      // Length of the piece between the previous sparse_size boundary and
      // the current end of the unexamined part of the extent. Deriving it
      // from the absolute offset (not from the extent length alone) keeps
      // the chunks on sparse_size boundaries even when the extent does not
      // start on one.
      uint64_t sub_len = (off + extent_left) % sparse_size;
      if (sub_len == 0) {
        sub_len = sparse_size;
      }
      if (sub_len > extent_left) {
        // the extent starts inside this block
        sub_len = extent_left;
      }

      ceph_assert(bl_off >= sub_len);
      bl_off -= sub_len;
      bufferlist sub_bl;
      sub_bl.substr_of(bl, bl_off, sub_len);
      if (!sub_bl.is_zero()) {
        break;
      }

      // the whole chunk is zero: the block it lives in can go
      new_end -= sparse_size;
      extent_left -= sub_len;
    }
    if (extent_left > 0) {
      // stopped on non-zero data; nothing below this point may be trimmed
      break;
    }
  }

  if (new_end < end) {
    *new_end_ptr = new_end;
    return true;
  }

  return false;
}
} // anonymous namespace
using util::create_context_callback;
using util::create_rados_callback;
@ -35,21 +88,24 @@ public:
*
* <start>
* |
* v (object map disabled)
* SPARSIFY -----------------------\
* | |
* | (object map enabled) |
* v |
* PRE UPDATE OBJECT MAP |
* | |
* v |
* CHECK EXISTS |
* | |
* v |
* POST UPDATE OBJECT MAP |
* | |
* v |
* <finish> <----------------------/
* v (not supported)
* SPARSIFY * * * * * * * * * * * * > READ < * * * * * * * * * * (concurrent
* | | * update is
* | (object map disabled) | (can trim) * detected)
* |------------------------\ V *
* | | PRE UPDATE OBJECT MAP *
* | (object map enabled) | | (if needed) *
* v | V *
* PRE UPDATE OBJECT MAP | TRIM * * * * * * * * * * *
* | | |
* v | V
* CHECK EXISTS | POST UPDATE OBJECT MAP
* | | | (if needed)
* v | |
* POST UPDATE OBJECT MAP | |
* | | |
* v | |
* <finish> <------------------/<-------/
*
* @endverbatim
*
@ -112,12 +168,20 @@ public:
void handle_sparsify(int r) {
ldout(m_cct, 20) << "r=" << r << dendl;
if (r < 0 && r != -ENOENT) {
lderr(m_cct) << "failed to sparsify: " << cpp_strerror(r) << dendl;
if (r == -EOPNOTSUPP) {
m_trying_trim = true;
send_read();
return;
}
if (r == -ENOENT) {
this->complete(0);
finish_op(0);
return;
}
if (r < 0) {
lderr(m_cct) << "failed to sparsify: " << cpp_strerror(r) << dendl;
finish_op(r);
return;
}
@ -125,34 +189,41 @@ public:
}
void send_pre_update_object_map() {
I *image_ctx = &this->m_image_ctx;
I &image_ctx = this->m_image_ctx;
if (!m_remove_empty || !image_ctx->test_features(RBD_FEATURE_OBJECT_MAP)) {
this->complete(0);
if (m_trying_trim) {
if (!m_remove_empty || m_new_end != 0 ||
!image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
send_trim();
return;
}
} else if (!m_remove_empty ||
!image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
finish_op(0);
return;
}
ldout(m_cct, 20) << dendl;
image_ctx->owner_lock.get_read();
image_ctx->snap_lock.get_read();
if (image_ctx->object_map == nullptr) {
image_ctx.owner_lock.get_read();
image_ctx.snap_lock.get_read();
if (image_ctx.object_map == nullptr) {
// possible that exclusive lock was lost in background
lderr(m_cct) << "object map is not initialized" << dendl;
image_ctx->snap_lock.put_read();
image_ctx->owner_lock.put_read();
this->complete(-EINVAL);
image_ctx.snap_lock.put_read();
image_ctx.owner_lock.put_read();
finish_op(-EINVAL);
return;
}
int r;
m_finish_op_ctx = image_ctx->exclusive_lock->start_op(&r);
m_finish_op_ctx = image_ctx.exclusive_lock->start_op(&r);
if (m_finish_op_ctx == nullptr) {
lderr(m_cct) << "lost exclusive lock" << dendl;
image_ctx->snap_lock.put_read();
image_ctx->owner_lock.put_read();
this->complete(r);
image_ctx.snap_lock.put_read();
image_ctx.owner_lock.put_read();
finish_op(r);
return;
}
@ -160,17 +231,17 @@ public:
C_SparsifyObject<I>,
&C_SparsifyObject<I>::handle_pre_update_object_map>(this);
image_ctx->object_map_lock.get_write();
bool sent = image_ctx->object_map->template aio_update<
image_ctx.object_map_lock.get_write();
bool sent = image_ctx.object_map->template aio_update<
Context, &Context::complete>(CEPH_NOSNAP, m_object_no, OBJECT_PENDING,
OBJECT_EXISTS, {}, false, ctx);
// NOTE: state machine might complete before we reach here
image_ctx->object_map_lock.put_write();
image_ctx->snap_lock.put_read();
image_ctx->owner_lock.put_read();
image_ctx.object_map_lock.put_write();
image_ctx.snap_lock.put_read();
image_ctx.owner_lock.put_read();
if (!sent) {
ctx->complete(0);
finish_op(0);
}
}
@ -184,7 +255,11 @@ public:
return;
}
send_check_exists();
if (m_trying_trim) {
send_trim();
} else {
send_check_exists();
}
}
void send_check_exists() {
@ -194,10 +269,10 @@ public:
librados::ObjectReadOperation op;
op.stat(NULL, NULL, NULL);
m_out_bl.clear();
m_bl.clear();
auto comp = create_rados_callback<
C_SparsifyObject, &C_SparsifyObject::handle_check_exists>(this);
int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_out_bl);
int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl);
ceph_assert(r == 0);
comp->release();
}
@ -217,6 +292,8 @@ public:
void send_post_update_object_map(bool exists) {
I &image_ctx = this->m_image_ctx;
ldout(m_cct, 20) << dendl;
auto ctx = create_context_callback<
C_SparsifyObject<I>,
&C_SparsifyObject<I>::handle_post_update_object_map>(this);
@ -253,10 +330,100 @@ public:
finish_op(0);
}
void send_read() {
I &image_ctx = this->m_image_ctx;
ldout(m_cct, 20) << dendl;
librados::ObjectReadOperation op;
m_bl.clear();
op.sparse_read(0, image_ctx.layout.object_size, &m_extent_map, &m_bl,
nullptr);
auto comp = create_rados_callback<
C_SparsifyObject, &C_SparsifyObject::handle_read>(this);
int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl);
ceph_assert(r == 0);
comp->release();
}
/**
 * Completion of the sparse read issued by send_read().
 *
 * A missing object (-ENOENT) finishes the request successfully; any other
 * error is logged and finishes the request with that error. On success the
 * read result is handed to may_be_trimmed(): if nothing can be trimmed the
 * request finishes with 0, otherwise the state machine proceeds to
 * send_pre_update_object_map().
 *
 * @param r result of the read operation
 */
void handle_read(int r) {
  ldout(m_cct, 20) << "r=" << r << dendl;

  if (r == -ENOENT) {
    // nothing to trim on an object that does not exist
    finish_op(0);
    return;
  }

  if (r < 0) {
    lderr(m_cct) << "failed to read object: " << cpp_strerror(r) << dendl;
    finish_op(r);
    return;
  }

  const bool trimmable = may_be_trimmed(m_extent_map, m_bl, m_sparse_size,
                                        &m_new_end);
  if (trimmable) {
    send_pre_update_object_map();
    return;
  }

  finish_op(0);
}
void send_trim() {
I &image_ctx = this->m_image_ctx;
ldout(m_cct, 20) << dendl;
ceph_assert(m_new_end < image_ctx.layout.object_size);
librados::ObjectWriteOperation op;
m_bl.clear();
m_bl.append_zero(image_ctx.layout.object_size - m_new_end);
op.cmpext(m_new_end, m_bl, nullptr);
if (m_new_end == 0 && m_remove_empty) {
op.remove();
} else {
op.truncate(m_new_end);
}
auto comp = create_rados_callback<
C_SparsifyObject, &C_SparsifyObject::handle_trim>(this);
int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op);
ceph_assert(r == 0);
comp->release();
}
/**
 * Completion of the trim operation issued by send_trim().
 *
 * - r <= -MAX_ERRNO: handled as a retry -- the pending exclusive-lock op
 *   context (if any) is released and the object is read again.
 *   NOTE(review): this is presumably the cmpext mismatch result, i.e. the
 *   object changed between the read and the trim -- confirm against the
 *   librados cmpext contract.
 * - any other error except -ENOENT: logged, request finishes with the error.
 * - success or -ENOENT: the request finishes with 0, unless the object was
 *   removed (m_remove_empty set and m_new_end == 0) and the image has the
 *   object map feature, in which case the object map is updated first.
 *
 * @param r result of the trim operation
 */
void handle_trim(int r) {
  I &image_ctx = this->m_image_ctx;

  ldout(m_cct, 20) << "r=" << r << dendl;

  if (r <= -MAX_ERRNO) {
    // m_finish_op_ctx is only set when send_pre_update_object_map() actually
    // started an object map update; when it went straight to send_trim()
    // the pointer is still null and must not be dereferenced.
    if (m_finish_op_ctx != nullptr) {
      m_finish_op_ctx->complete(0);
      m_finish_op_ctx = nullptr;
    }
    send_read();
    return;
  }

  if (r < 0 && r != -ENOENT) {
    lderr(m_cct) << "failed to trim: " << cpp_strerror(r) << dendl;
    finish_op(r);
    return;
  }

  // the object map only needs a post-update when the object was removed
  if (!m_remove_empty || m_new_end != 0 ||
      !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
    finish_op(0);
    return;
  }

  send_post_update_object_map(false);
}
void finish_op(int r) {
ldout(m_cct, 20) << "r=" << r << dendl;
m_finish_op_ctx->complete(0);
if (m_finish_op_ctx != nullptr) {
m_finish_op_ctx->complete(0);
}
this->complete(r);
}
@ -267,7 +434,10 @@ private:
std::string m_oid;
bool m_remove_empty = false;
bufferlist m_out_bl;
bool m_trying_trim = false;
bufferlist m_bl;
std::map<uint64_t,uint64_t> m_extent_map;
uint64_t m_new_end = 0;
Context *m_finish_op_ctx = nullptr;
};

View File

@ -1365,11 +1365,14 @@ TEST_F(TestInternal, Sparsify) {
librbd::ImageCtx *ictx;
ASSERT_EQ(0, open_image(m_image_name, &ictx));
REQUIRE(is_sparsify_supported(ictx->data_ctx, ictx->get_object_name(10)));
bool sparsify_supported = is_sparsify_supported(ictx->data_ctx,
ictx->get_object_name(10));
bool sparse_read_supported = is_sparse_read_supported(
ictx->data_ctx, ictx->get_object_name(10));
std::cout << "sparsify_supported=" << sparsify_supported << std::endl;
std::cout << "sparse_read_supported=" << sparse_read_supported << std::endl;
librbd::NoOpProgressContext no_op;
ASSERT_EQ(0, ictx->operations->resize((1 << ictx->order) * 20, true, no_op));
@ -1379,13 +1382,24 @@ TEST_F(TestInternal, Sparsify) {
ASSERT_EQ((ssize_t)bl.length(),
ictx->io_work_queue->write(0, bl.length(), bufferlist{bl}, 0));
ASSERT_EQ((ssize_t)bl.length(),
ictx->io_work_queue->write((1 << ictx->order) * 1 + 512,
bl.length(), bufferlist{bl}, 0));
bl.append(std::string(4096, '1'));
bl.append(std::string(4096, '\0'));
bl.append(std::string(4096, '2'));
bl.append(std::string(4096, '\0'));
bl.append(std::string(4096 - 1, '\0'));
ASSERT_EQ((ssize_t)bl.length(),
ictx->io_work_queue->write((1 << ictx->order) * 10, bl.length(),
bufferlist{bl}, 0));
bufferlist bl2;
bl2.append(std::string(4096 - 1, '\0'));
ASSERT_EQ((ssize_t)bl2.length(),
ictx->io_work_queue->write((1 << ictx->order) * 10 + 4096 * 10,
bl2.length(), bufferlist{bl2}, 0));
ASSERT_EQ(0, ictx->io_work_queue->flush());
ASSERT_EQ(0, ictx->operations->sparsify(4096, no_op));
@ -1404,21 +1418,30 @@ TEST_F(TestInternal, Sparsify) {
uint64_t size;
ASSERT_EQ(-ENOENT, ictx->data_ctx.stat(oid, &size, NULL));
if (!sparse_read_supported) {
return;
}
oid = ictx->get_object_name(1);
ASSERT_EQ(-ENOENT, ictx->data_ctx.stat(oid, &size, NULL));
oid = ictx->get_object_name(10);
std::map<uint64_t, uint64_t> m;
read_bl.clear();
ASSERT_EQ(2, ictx->data_ctx.sparse_read(oid, m, read_bl, bl.length(), 0));
std::map<uint64_t, uint64_t> expected_m =
{{4096 * 1, 4096}, {4096 * 3, 4096}};
ASSERT_EQ(m, expected_m);
std::map<uint64_t, uint64_t> expected_m;
auto read_len = bl.length();
bl.clear();
bl.append(std::string(4096, '1'));
bl.append(std::string(4096, '2'));
ASSERT_TRUE(bl.contents_equal(read_bl));
if (sparsify_supported && sparse_read_supported) {
expected_m = {{4096 * 1, 4096}, {4096 * 3, 4096}};
bl.append(std::string(4096, '1'));
bl.append(std::string(4096, '2'));
} else {
expected_m = {{0, 4096 * 4}};
bl.append(std::string(4096, '\0'));
bl.append(std::string(4096, '1'));
bl.append(std::string(4096, '\0'));
bl.append(std::string(4096, '2'));
}
read_bl.clear();
EXPECT_EQ(static_cast<int>(expected_m.size()),
ictx->data_ctx.sparse_read(oid, m, read_bl, read_len, 0));
EXPECT_EQ(m, expected_m);
EXPECT_TRUE(bl.contents_equal(read_bl));
}
@ -1428,10 +1451,9 @@ TEST_F(TestInternal, SparsifyClone) {
librbd::ImageCtx *ictx;
ASSERT_EQ(0, open_image(m_image_name, &ictx));
REQUIRE(is_sparsify_supported(ictx->data_ctx, ictx->get_object_name(10)));
bool sparse_read_supported = is_sparse_read_supported(
ictx->data_ctx, ictx->get_object_name(10));
bool sparsify_supported = is_sparsify_supported(ictx->data_ctx,
ictx->get_object_name(10));
std::cout << "sparsify_supported=" << sparsify_supported << std::endl;
librbd::NoOpProgressContext no_op;
ASSERT_EQ(0, ictx->operations->resize((1 << ictx->order) * 10, true, no_op));
@ -1484,20 +1506,4 @@ TEST_F(TestInternal, SparsifyClone) {
uint64_t size;
ASSERT_EQ(0, ictx->data_ctx.stat(oid, &size, NULL));
ASSERT_EQ(0, ictx->data_ctx.read(oid, read_bl, 4096, 0));
if (!sparse_read_supported) {
return;
}
oid = ictx->get_object_name(10);
std::map<uint64_t, uint64_t> m;
read_bl.clear();
ASSERT_EQ(2, ictx->data_ctx.sparse_read(oid, m, read_bl, bl.length(), 0));
std::map<uint64_t, uint64_t> expected_m =
{{4096 * 1, 4096}, {4096 * 3, 4096}};
ASSERT_EQ(m, expected_m);
bl.clear();
bl.append(std::string(4096, '1'));
bl.append(std::string(4096, '2'));
ASSERT_TRUE(bl.contents_equal(read_bl));
}