Merge pull request #45775 from liu-chunmei/seastore-zero

crimson: seastore add OP_ZERO support

Reviewed-by: Samuel Just <sjust@redhat.com>
Reviewed-by: Chunmei Liu <chunmei.liu@intel.com>
This commit is contained in:
Samuel Just 2022-04-12 20:20:54 -07:00 committed by GitHub
commit 3a21a91528
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 353 additions and 26 deletions

View File

@ -58,6 +58,43 @@ struct extent_to_write_t {
};
using extent_to_write_list_t = std::list<extent_to_write_t>;
/**
* append_extent_to_write
*
* Appends passed extent_to_write_t maintaining invariant that the
* list may not contain consecutive zero elements by checking and
* combining them.
*/
void append_extent_to_write(
extent_to_write_list_t &to_write, extent_to_write_t &&to_append)
{
assert(
to_write.empty() ||
(to_write.back().addr + to_write.back().len) == to_append.addr);
if (to_write.empty() || to_write.back().to_write || to_append.to_write) {
to_write.push_back(std::move(to_append));
} else {
to_write.back().len += to_append.len;
}
}
/**
* splice_extent_to_write
*
* splices passed extent_to_write_list_t maintaining invariant that the
* list may not contain consecutive zero elements by checking and
* combining them.
*/
void splice_extent_to_write(
extent_to_write_list_t &to_write, extent_to_write_list_t &&to_splice)
{
if (!to_splice.empty()) {
append_extent_to_write(to_write, std::move(to_splice.front()));
to_splice.pop_front();
to_write.splice(to_write.end(), std::move(to_splice));
}
}
/// Removes extents/mappings in pins
ObjectDataHandler::write_ret do_removals(
context_t ctx,
@ -177,7 +214,10 @@ split_ret split_pin_left(context_t ctx, LBAPinRef &pin, laddr_t offset)
(zero_extent_len == 0
? std::nullopt
: std::make_optional(extent_to_write_t(pin_offset, zero_extent_len))),
bufferptr(ceph::buffer::create(zero_prepend_len, 0))
(zero_prepend_len == 0
? std::nullopt
: std::make_optional(
bufferptr(ceph::buffer::create(zero_prepend_len, 0))))
);
} else {
// Data, return up to offset to prepend
@ -213,7 +253,10 @@ split_ret split_pin_right(context_t ctx, LBAPinRef &pin, laddr_t end)
(zero_extent_len == 0
? std::nullopt
: std::make_optional(extent_to_write_t(aligned_end, zero_extent_len))),
bufferptr(ceph::buffer::create(zero_suffix_len, 0))
(zero_suffix_len == 0
? std::nullopt
: std::make_optional(
bufferptr(ceph::buffer::create(zero_suffix_len, 0))))
);
} else {
return read_pin(ctx, pin->duplicate()
@ -355,6 +398,80 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
});
}
/**
* get_zero_buffers
*
* Returns extent_to_write_t's reflecting a zero region extending
* from offset~len with headptr optionally on the left and tailptr
* optionally on the right.
*/
extent_to_write_list_t get_zero_buffers(
const extent_len_t block_size,
laddr_t offset, extent_len_t len,
std::optional<bufferptr> &&headptr, std::optional<bufferptr> &&tailptr)
{
auto zero_left = p2roundup(offset, (laddr_t)block_size);
auto zero_right = p2align(offset + len, (laddr_t)block_size);
auto left = headptr ? (offset - headptr->length()) : offset;
auto right = tailptr ?
(offset + len + tailptr->length()) :
(offset + len);
assert(
(headptr && ((zero_left - left) ==
p2roundup(headptr->length(), block_size))) ^
(!headptr && (zero_left == left)));
assert(
(tailptr && ((right - zero_right) ==
p2roundup(tailptr->length(), block_size))) ^
(!tailptr && (right == zero_right)));
assert(right > left);
assert((left % block_size) == 0);
assert((right % block_size) == 0);
// zero region too small for a reserved section,
// headptr and tailptr in same extent
if (zero_right <= zero_left) {
bufferlist bl;
if (headptr) {
bl.append(*headptr);
}
bl.append_zero(
right - left - bl.length() - (tailptr ? tailptr->length() : 0));
if (tailptr) {
bl.append(*tailptr);
}
assert(bl.length() % block_size == 0);
assert(bl.length() == (right - left));
return {{left, bl}};
} else {
// reserved section between ends, headptr and tailptr in different extents
extent_to_write_list_t ret;
if (headptr) {
bufferlist headbl;
headbl.append(*headptr);
headbl.append_zero(zero_left - left - headbl.length());
assert(headbl.length() % block_size == 0);
assert(headbl.length() > 0);
ret.emplace_back(left, headbl);
}
// reserved zero region
ret.emplace_back(zero_left, zero_right - zero_left);
assert(ret.back().len % block_size == 0);
assert(ret.back().len > 0);
if (tailptr) {
bufferlist tailbl;
tailbl.append(*tailptr);
tailbl.append_zero(right - zero_right - tailbl.length());
assert(tailbl.length() % block_size == 0);
assert(tailbl.length() > 0);
ret.emplace_back(zero_right, tailbl);
}
return ret;
}
}
/**
* get_buffers
*
@ -372,61 +489,84 @@ extent_to_write_list_t get_buffers(laddr_t offset, bufferlist &bl)
ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
context_t ctx,
laddr_t _offset,
bufferlist &&bl,
extent_len_t len,
std::optional<bufferlist> &&bl,
lba_pin_list_t &&_pins)
{
if (bl) {
assert(bl->length() == len);
}
return seastar::do_with(
_offset,
std::move(bl),
std::optional<bufferptr>(),
std::move(_pins),
extent_to_write_list_t(),
[ctx](laddr_t &offset, auto &bl, auto &pins, auto &to_write) {
[ctx, len](laddr_t &offset, auto &bl, auto &headptr,
auto &pins, auto &to_write) {
LOG_PREFIX(ObjectDataHandler::overwrite);
DEBUGT("overwrite: {}~{}",
ctx.t,
offset,
bl.length());
len);
ceph_assert(pins.size() >= 1);
auto pin_begin = pins.front()->get_key();
ceph_assert(pin_begin <= offset);
auto pin_end = pins.back()->get_key() + pins.back()->get_length();
ceph_assert(pin_end >= (offset + bl.length()));
ceph_assert(pin_end >= (offset + len));
return split_pin_left(
ctx,
pins.front(),
offset
).si_then([ctx, pin_begin, &offset, &bl, &pins, &to_write](
).si_then([ctx, len, pin_begin, &offset, &headptr, &pins, &to_write](
auto p) {
auto &[left_extent, headptr] = p;
auto &[left_extent, _headptr] = p;
if (left_extent) {
ceph_assert(left_extent->addr == pin_begin);
to_write.push_front(std::move(*left_extent));
append_extent_to_write(to_write, std::move(*left_extent));
}
if (headptr) {
bufferlist newbl;
newbl.append(*headptr);
newbl.append(bl);
bl.swap(newbl);
offset -= headptr->length();
assert_aligned(offset);
if (_headptr) {
assert(_headptr->length() > 0);
headptr = std::move(_headptr);
}
return split_pin_right(
ctx,
pins.back(),
offset + bl.length());
}).si_then([ctx, pin_end, &offset, &bl, &to_write](
auto p) {
offset + len);
}).si_then([ctx, len, pin_begin, pin_end,
&offset, &bl, &headptr, &to_write](auto p) {
auto &[right_extent, tailptr] = p;
if (tailptr) {
bl.append(*tailptr);
assert_aligned(bl.length());
if (bl) {
bufferlist write_bl;
if (headptr) {
write_bl.append(*headptr);
offset -= headptr->length();
assert_aligned(offset);
}
write_bl.claim_append(*bl);
if (tailptr) {
write_bl.append(*tailptr);
assert_aligned(write_bl.length());
}
splice_extent_to_write(to_write, get_buffers(offset, write_bl));
} else {
splice_extent_to_write(
to_write,
get_zero_buffers(
ctx.tm.get_block_size(),
offset,
len,
std::move(headptr),
std::move(tailptr)));
}
to_write.splice(to_write.end(), get_buffers(offset, bl));
if (right_extent) {
ceph_assert((right_extent->addr + right_extent->len) == pin_end);
to_write.push_back(std::move(*right_extent));
append_extent_to_write(to_write, std::move(*right_extent));
}
assert(to_write.size());
assert(pin_begin == to_write.front().addr);
assert(pin_end == (to_write.back().addr + to_write.back().len));
return write_iertr::now();
}).si_then([ctx, &pins] {
return do_removals(ctx, pins);
@ -436,6 +576,41 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
});
}
ObjectDataHandler::zero_ret ObjectDataHandler::zero(
context_t ctx,
objaddr_t offset,
extent_len_t len)
{
return with_object_data(
ctx,
[this, ctx, offset, len](auto &object_data) {
LOG_PREFIX(ObjectDataHandler::zero);
DEBUGT("zero to {}~{}, object_data: {}~{}, is_null {}",
ctx.t,
offset,
len,
object_data.get_reserved_data_base(),
object_data.get_reserved_data_len(),
object_data.is_null());
return prepare_data_reservation(
ctx,
object_data,
p2roundup(offset + len, ctx.tm.get_block_size())
).si_then([this, ctx, offset, len, &object_data] {
auto logical_offset = object_data.get_reserved_data_base() + offset;
return ctx.tm.get_pins(
ctx.t,
logical_offset,
len
).si_then([this, ctx, logical_offset, len](auto pins) {
return overwrite(
ctx, logical_offset, len,
std::nullopt, std::move(pins));
});
});
});
}
ObjectDataHandler::write_ret ObjectDataHandler::write(
context_t ctx,
objaddr_t offset,
@ -464,7 +639,9 @@ ObjectDataHandler::write_ret ObjectDataHandler::write(
bl.length()
).si_then([this, ctx,logical_offset, &bl](
auto pins) {
return overwrite(ctx, logical_offset, bufferlist(bl), std::move(pins));
return overwrite(
ctx, logical_offset, bl.length(),
bufferlist(bl), std::move(pins));
});
});
});

View File

@ -66,6 +66,13 @@ public:
objaddr_t offset,
const bufferlist &bl);
using zero_iertr = base_iertr;
using zero_ret = zero_iertr::future<>;
zero_ret zero(
context_t ctx,
objaddr_t offset,
extent_len_t len);
/// Reads data in [offset, offset + len)
using read_iertr = base_iertr;
using read_ret = read_iertr::future<bufferlist>;
@ -99,7 +106,8 @@ private:
write_ret overwrite(
context_t ctx, ///< [in] ctx
laddr_t offset, ///< [in] write offset
bufferlist &&bl, ///< [in] buffer to write
extent_len_t len, ///< [in] len to write, len == bl->length() if bl
std::optional<bufferlist> &&bl, ///< [in] buffer to write, empty for zeros
lba_pin_list_t &&pins ///< [in] set of pins overlapping above region
);

View File

@ -1167,6 +1167,12 @@ SeaStore::tm_ret SeaStore::_do_transaction_step(
i.decode_bl(hint);
return tm_iertr::now();
}
case Transaction::OP_ZERO:
{
objaddr_t off = op->off;
extent_len_t len = op->len;
return _zero(ctx, get_onode(op->oid), off, len);
}
default:
ERROR("bad op {}", static_cast<unsigned>(op->op));
return crimson::ct_error::input_output_error::make();
@ -1225,6 +1231,33 @@ SeaStore::tm_ret SeaStore::_write(
});
}
SeaStore::tm_ret SeaStore::_zero(
internal_context_t &ctx,
OnodeRef &onode,
objaddr_t offset,
extent_len_t len)
{
LOG_PREFIX(SeaStore::_zero);
DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len);
if (offset + len >= max_object_size) {
return crimson::ct_error::input_output_error::make();
}
auto &object_size = onode->get_mutable_layout(*ctx.transaction).size;
object_size = std::max<uint64_t>(offset + len, object_size);
return seastar::do_with(
ObjectDataHandler(max_object_size),
[=, &ctx, &onode](auto &objhandler) {
return objhandler.zero(
ObjectDataHandler::context_t{
*transaction_manager,
*ctx.transaction,
*onode,
},
offset,
len);
});
}
SeaStore::omap_set_kvs_ret
SeaStore::_omap_set_kvs(
OnodeRef &onode,

View File

@ -339,6 +339,10 @@ private:
uint64_t offset, size_t len,
ceph::bufferlist &&bl,
uint32_t fadvise_flags);
tm_ret _zero(
internal_context_t &ctx,
OnodeRef &onode,
objaddr_t offset, extent_len_t len);
tm_ret _omap_set_values(
internal_context_t &ctx,
OnodeRef &onode,

View File

@ -217,6 +217,53 @@ struct seastore_test_t :
write(seastore, offset, bl);
}
void zero(
SeaStore &seastore,
CTransaction &t,
uint64_t offset,
size_t len) {
ceph::buffer::list bl;
bl.append_zero(len);
bufferlist new_contents;
if (offset > 0 && contents.length()) {
new_contents.substr_of(
contents,
0,
std::min<size_t>(offset, contents.length())
);
}
new_contents.append_zero(offset - new_contents.length());
new_contents.append(bl);
auto tail_offset = offset + bl.length();
if (contents.length() > tail_offset) {
bufferlist tail;
tail.substr_of(
contents,
tail_offset,
contents.length() - tail_offset);
new_contents.append(tail);
}
contents.swap(new_contents);
t.zero(
cid,
oid,
offset,
len);
}
void zero(
SeaStore &seastore,
uint64_t offset,
size_t len) {
CTransaction t;
zero(seastore, t, offset, len);
seastore.do_transaction(
coll,
std::move(t)).get0();
}
void read(
SeaStore &seastore,
uint64_t offset,
@ -739,3 +786,61 @@ TEST_F(seastore_test_t, sparse_read)
test_obj.remove(*seastore);
});
}
TEST_F(seastore_test_t, zero)
{
run_async([this] {
auto test_zero = [this](
// [(off, len, repeat)]
std::vector<std::tuple<uint64_t, uint64_t, uint64_t>> writes,
uint64_t zero_off, uint64_t zero_len) {
// Test zero within a block
auto &test_obj = get_object(make_oid(0));
uint64_t size = 0;
for (auto &[off, len, repeat]: writes) {
for (decltype(repeat) i = 0; i < repeat; ++i) {
test_obj.write(*seastore, off + (len * repeat), len, 'a');
}
size = off + (len * (repeat + 1));
}
test_obj.read(
*seastore,
0,
size);
test_obj.check_size(*seastore);
test_obj.zero(*seastore, zero_off, zero_len);
test_obj.read(
*seastore,
0,
size);
test_obj.check_size(*seastore);
remove_object(test_obj);
};
const uint64_t BS = 4<<10;
// Test zero within a block
test_zero(
{{1<<10, 1<<10, 1}},
1124, 200);
// Multiple writes, partial on left, partial on right.
test_zero(
{{BS, BS, 10}},
BS + 128,
BS * 4);
// Single large write, block boundary on right, partial on left.
test_zero(
{{BS, BS * 10, 1}},
BS + 128,
(BS * 4) - 128);
// Multiple writes, block boundary on left, partial on right.
test_zero(
{{BS, BS, 10}},
BS,
(BS * 4) + 128);
});
}