mirror of
https://github.com/ceph/ceph
synced 2025-01-16 16:03:29 +00:00
Merge pull request #45775 from liu-chunmei/seastore-zero
crimson: seastore add OP_ZERO support Reviewed-by: Samuel Just <sjust@redhat.com> Reviewed-by: Chunmei Liu <chunmei.liu@intel.com>
This commit is contained in:
commit
3a21a91528
@ -58,6 +58,43 @@ struct extent_to_write_t {
|
||||
};
|
||||
using extent_to_write_list_t = std::list<extent_to_write_t>;
|
||||
|
||||
/**
|
||||
* append_extent_to_write
|
||||
*
|
||||
* Appends passed extent_to_write_t maintaining invariant that the
|
||||
* list may not contain consecutive zero elements by checking and
|
||||
* combining them.
|
||||
*/
|
||||
void append_extent_to_write(
|
||||
extent_to_write_list_t &to_write, extent_to_write_t &&to_append)
|
||||
{
|
||||
assert(
|
||||
to_write.empty() ||
|
||||
(to_write.back().addr + to_write.back().len) == to_append.addr);
|
||||
if (to_write.empty() || to_write.back().to_write || to_append.to_write) {
|
||||
to_write.push_back(std::move(to_append));
|
||||
} else {
|
||||
to_write.back().len += to_append.len;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* splice_extent_to_write
|
||||
*
|
||||
* splices passed extent_to_write_list_t maintaining invariant that the
|
||||
* list may not contain consecutive zero elements by checking and
|
||||
* combining them.
|
||||
*/
|
||||
void splice_extent_to_write(
|
||||
extent_to_write_list_t &to_write, extent_to_write_list_t &&to_splice)
|
||||
{
|
||||
if (!to_splice.empty()) {
|
||||
append_extent_to_write(to_write, std::move(to_splice.front()));
|
||||
to_splice.pop_front();
|
||||
to_write.splice(to_write.end(), std::move(to_splice));
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes extents/mappings in pins
|
||||
ObjectDataHandler::write_ret do_removals(
|
||||
context_t ctx,
|
||||
@ -177,7 +214,10 @@ split_ret split_pin_left(context_t ctx, LBAPinRef &pin, laddr_t offset)
|
||||
(zero_extent_len == 0
|
||||
? std::nullopt
|
||||
: std::make_optional(extent_to_write_t(pin_offset, zero_extent_len))),
|
||||
bufferptr(ceph::buffer::create(zero_prepend_len, 0))
|
||||
(zero_prepend_len == 0
|
||||
? std::nullopt
|
||||
: std::make_optional(
|
||||
bufferptr(ceph::buffer::create(zero_prepend_len, 0))))
|
||||
);
|
||||
} else {
|
||||
// Data, return up to offset to prepend
|
||||
@ -213,7 +253,10 @@ split_ret split_pin_right(context_t ctx, LBAPinRef &pin, laddr_t end)
|
||||
(zero_extent_len == 0
|
||||
? std::nullopt
|
||||
: std::make_optional(extent_to_write_t(aligned_end, zero_extent_len))),
|
||||
bufferptr(ceph::buffer::create(zero_suffix_len, 0))
|
||||
(zero_suffix_len == 0
|
||||
? std::nullopt
|
||||
: std::make_optional(
|
||||
bufferptr(ceph::buffer::create(zero_suffix_len, 0))))
|
||||
);
|
||||
} else {
|
||||
return read_pin(ctx, pin->duplicate()
|
||||
@ -355,6 +398,80 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* get_zero_buffers
|
||||
*
|
||||
* Returns extent_to_write_t's reflecting a zero region extending
|
||||
* from offset~len with headptr optionally on the left and tailptr
|
||||
* optionally on the right.
|
||||
*/
|
||||
extent_to_write_list_t get_zero_buffers(
|
||||
const extent_len_t block_size,
|
||||
laddr_t offset, extent_len_t len,
|
||||
std::optional<bufferptr> &&headptr, std::optional<bufferptr> &&tailptr)
|
||||
{
|
||||
auto zero_left = p2roundup(offset, (laddr_t)block_size);
|
||||
auto zero_right = p2align(offset + len, (laddr_t)block_size);
|
||||
auto left = headptr ? (offset - headptr->length()) : offset;
|
||||
auto right = tailptr ?
|
||||
(offset + len + tailptr->length()) :
|
||||
(offset + len);
|
||||
|
||||
assert(
|
||||
(headptr && ((zero_left - left) ==
|
||||
p2roundup(headptr->length(), block_size))) ^
|
||||
(!headptr && (zero_left == left)));
|
||||
assert(
|
||||
(tailptr && ((right - zero_right) ==
|
||||
p2roundup(tailptr->length(), block_size))) ^
|
||||
(!tailptr && (right == zero_right)));
|
||||
|
||||
assert(right > left);
|
||||
assert((left % block_size) == 0);
|
||||
assert((right % block_size) == 0);
|
||||
|
||||
// zero region too small for a reserved section,
|
||||
// headptr and tailptr in same extent
|
||||
if (zero_right <= zero_left) {
|
||||
bufferlist bl;
|
||||
if (headptr) {
|
||||
bl.append(*headptr);
|
||||
}
|
||||
bl.append_zero(
|
||||
right - left - bl.length() - (tailptr ? tailptr->length() : 0));
|
||||
if (tailptr) {
|
||||
bl.append(*tailptr);
|
||||
}
|
||||
assert(bl.length() % block_size == 0);
|
||||
assert(bl.length() == (right - left));
|
||||
return {{left, bl}};
|
||||
} else {
|
||||
// reserved section between ends, headptr and tailptr in different extents
|
||||
extent_to_write_list_t ret;
|
||||
if (headptr) {
|
||||
bufferlist headbl;
|
||||
headbl.append(*headptr);
|
||||
headbl.append_zero(zero_left - left - headbl.length());
|
||||
assert(headbl.length() % block_size == 0);
|
||||
assert(headbl.length() > 0);
|
||||
ret.emplace_back(left, headbl);
|
||||
}
|
||||
// reserved zero region
|
||||
ret.emplace_back(zero_left, zero_right - zero_left);
|
||||
assert(ret.back().len % block_size == 0);
|
||||
assert(ret.back().len > 0);
|
||||
if (tailptr) {
|
||||
bufferlist tailbl;
|
||||
tailbl.append(*tailptr);
|
||||
tailbl.append_zero(right - zero_right - tailbl.length());
|
||||
assert(tailbl.length() % block_size == 0);
|
||||
assert(tailbl.length() > 0);
|
||||
ret.emplace_back(zero_right, tailbl);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* get_buffers
|
||||
*
|
||||
@ -372,61 +489,84 @@ extent_to_write_list_t get_buffers(laddr_t offset, bufferlist &bl)
|
||||
ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
|
||||
context_t ctx,
|
||||
laddr_t _offset,
|
||||
bufferlist &&bl,
|
||||
extent_len_t len,
|
||||
std::optional<bufferlist> &&bl,
|
||||
lba_pin_list_t &&_pins)
|
||||
{
|
||||
if (bl) {
|
||||
assert(bl->length() == len);
|
||||
}
|
||||
return seastar::do_with(
|
||||
_offset,
|
||||
std::move(bl),
|
||||
std::optional<bufferptr>(),
|
||||
std::move(_pins),
|
||||
extent_to_write_list_t(),
|
||||
[ctx](laddr_t &offset, auto &bl, auto &pins, auto &to_write) {
|
||||
[ctx, len](laddr_t &offset, auto &bl, auto &headptr,
|
||||
auto &pins, auto &to_write) {
|
||||
LOG_PREFIX(ObjectDataHandler::overwrite);
|
||||
DEBUGT("overwrite: {}~{}",
|
||||
ctx.t,
|
||||
offset,
|
||||
bl.length());
|
||||
len);
|
||||
ceph_assert(pins.size() >= 1);
|
||||
auto pin_begin = pins.front()->get_key();
|
||||
ceph_assert(pin_begin <= offset);
|
||||
auto pin_end = pins.back()->get_key() + pins.back()->get_length();
|
||||
ceph_assert(pin_end >= (offset + bl.length()));
|
||||
ceph_assert(pin_end >= (offset + len));
|
||||
|
||||
return split_pin_left(
|
||||
ctx,
|
||||
pins.front(),
|
||||
offset
|
||||
).si_then([ctx, pin_begin, &offset, &bl, &pins, &to_write](
|
||||
).si_then([ctx, len, pin_begin, &offset, &headptr, &pins, &to_write](
|
||||
auto p) {
|
||||
auto &[left_extent, headptr] = p;
|
||||
auto &[left_extent, _headptr] = p;
|
||||
if (left_extent) {
|
||||
ceph_assert(left_extent->addr == pin_begin);
|
||||
to_write.push_front(std::move(*left_extent));
|
||||
append_extent_to_write(to_write, std::move(*left_extent));
|
||||
}
|
||||
if (headptr) {
|
||||
bufferlist newbl;
|
||||
newbl.append(*headptr);
|
||||
newbl.append(bl);
|
||||
bl.swap(newbl);
|
||||
offset -= headptr->length();
|
||||
assert_aligned(offset);
|
||||
if (_headptr) {
|
||||
assert(_headptr->length() > 0);
|
||||
headptr = std::move(_headptr);
|
||||
}
|
||||
return split_pin_right(
|
||||
ctx,
|
||||
pins.back(),
|
||||
offset + bl.length());
|
||||
}).si_then([ctx, pin_end, &offset, &bl, &to_write](
|
||||
auto p) {
|
||||
offset + len);
|
||||
}).si_then([ctx, len, pin_begin, pin_end,
|
||||
&offset, &bl, &headptr, &to_write](auto p) {
|
||||
auto &[right_extent, tailptr] = p;
|
||||
if (tailptr) {
|
||||
bl.append(*tailptr);
|
||||
assert_aligned(bl.length());
|
||||
if (bl) {
|
||||
bufferlist write_bl;
|
||||
if (headptr) {
|
||||
write_bl.append(*headptr);
|
||||
offset -= headptr->length();
|
||||
assert_aligned(offset);
|
||||
}
|
||||
write_bl.claim_append(*bl);
|
||||
if (tailptr) {
|
||||
write_bl.append(*tailptr);
|
||||
assert_aligned(write_bl.length());
|
||||
}
|
||||
splice_extent_to_write(to_write, get_buffers(offset, write_bl));
|
||||
} else {
|
||||
splice_extent_to_write(
|
||||
to_write,
|
||||
get_zero_buffers(
|
||||
ctx.tm.get_block_size(),
|
||||
offset,
|
||||
len,
|
||||
std::move(headptr),
|
||||
std::move(tailptr)));
|
||||
}
|
||||
to_write.splice(to_write.end(), get_buffers(offset, bl));
|
||||
if (right_extent) {
|
||||
ceph_assert((right_extent->addr + right_extent->len) == pin_end);
|
||||
to_write.push_back(std::move(*right_extent));
|
||||
append_extent_to_write(to_write, std::move(*right_extent));
|
||||
}
|
||||
assert(to_write.size());
|
||||
assert(pin_begin == to_write.front().addr);
|
||||
assert(pin_end == (to_write.back().addr + to_write.back().len));
|
||||
return write_iertr::now();
|
||||
}).si_then([ctx, &pins] {
|
||||
return do_removals(ctx, pins);
|
||||
@ -436,6 +576,41 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
|
||||
});
|
||||
}
|
||||
|
||||
ObjectDataHandler::zero_ret ObjectDataHandler::zero(
|
||||
context_t ctx,
|
||||
objaddr_t offset,
|
||||
extent_len_t len)
|
||||
{
|
||||
return with_object_data(
|
||||
ctx,
|
||||
[this, ctx, offset, len](auto &object_data) {
|
||||
LOG_PREFIX(ObjectDataHandler::zero);
|
||||
DEBUGT("zero to {}~{}, object_data: {}~{}, is_null {}",
|
||||
ctx.t,
|
||||
offset,
|
||||
len,
|
||||
object_data.get_reserved_data_base(),
|
||||
object_data.get_reserved_data_len(),
|
||||
object_data.is_null());
|
||||
return prepare_data_reservation(
|
||||
ctx,
|
||||
object_data,
|
||||
p2roundup(offset + len, ctx.tm.get_block_size())
|
||||
).si_then([this, ctx, offset, len, &object_data] {
|
||||
auto logical_offset = object_data.get_reserved_data_base() + offset;
|
||||
return ctx.tm.get_pins(
|
||||
ctx.t,
|
||||
logical_offset,
|
||||
len
|
||||
).si_then([this, ctx, logical_offset, len](auto pins) {
|
||||
return overwrite(
|
||||
ctx, logical_offset, len,
|
||||
std::nullopt, std::move(pins));
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
ObjectDataHandler::write_ret ObjectDataHandler::write(
|
||||
context_t ctx,
|
||||
objaddr_t offset,
|
||||
@ -464,7 +639,9 @@ ObjectDataHandler::write_ret ObjectDataHandler::write(
|
||||
bl.length()
|
||||
).si_then([this, ctx,logical_offset, &bl](
|
||||
auto pins) {
|
||||
return overwrite(ctx, logical_offset, bufferlist(bl), std::move(pins));
|
||||
return overwrite(
|
||||
ctx, logical_offset, bl.length(),
|
||||
bufferlist(bl), std::move(pins));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
@ -66,6 +66,13 @@ public:
|
||||
objaddr_t offset,
|
||||
const bufferlist &bl);
|
||||
|
||||
using zero_iertr = base_iertr;
|
||||
using zero_ret = zero_iertr::future<>;
|
||||
zero_ret zero(
|
||||
context_t ctx,
|
||||
objaddr_t offset,
|
||||
extent_len_t len);
|
||||
|
||||
/// Reads data in [offset, offset + len)
|
||||
using read_iertr = base_iertr;
|
||||
using read_ret = read_iertr::future<bufferlist>;
|
||||
@ -99,7 +106,8 @@ private:
|
||||
write_ret overwrite(
|
||||
context_t ctx, ///< [in] ctx
|
||||
laddr_t offset, ///< [in] write offset
|
||||
bufferlist &&bl, ///< [in] buffer to write
|
||||
extent_len_t len, ///< [in] len to write, len == bl->length() if bl
|
||||
std::optional<bufferlist> &&bl, ///< [in] buffer to write, empty for zeros
|
||||
lba_pin_list_t &&pins ///< [in] set of pins overlapping above region
|
||||
);
|
||||
|
||||
|
@ -1167,6 +1167,12 @@ SeaStore::tm_ret SeaStore::_do_transaction_step(
|
||||
i.decode_bl(hint);
|
||||
return tm_iertr::now();
|
||||
}
|
||||
case Transaction::OP_ZERO:
|
||||
{
|
||||
objaddr_t off = op->off;
|
||||
extent_len_t len = op->len;
|
||||
return _zero(ctx, get_onode(op->oid), off, len);
|
||||
}
|
||||
default:
|
||||
ERROR("bad op {}", static_cast<unsigned>(op->op));
|
||||
return crimson::ct_error::input_output_error::make();
|
||||
@ -1225,6 +1231,33 @@ SeaStore::tm_ret SeaStore::_write(
|
||||
});
|
||||
}
|
||||
|
||||
SeaStore::tm_ret SeaStore::_zero(
|
||||
internal_context_t &ctx,
|
||||
OnodeRef &onode,
|
||||
objaddr_t offset,
|
||||
extent_len_t len)
|
||||
{
|
||||
LOG_PREFIX(SeaStore::_zero);
|
||||
DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len);
|
||||
if (offset + len >= max_object_size) {
|
||||
return crimson::ct_error::input_output_error::make();
|
||||
}
|
||||
auto &object_size = onode->get_mutable_layout(*ctx.transaction).size;
|
||||
object_size = std::max<uint64_t>(offset + len, object_size);
|
||||
return seastar::do_with(
|
||||
ObjectDataHandler(max_object_size),
|
||||
[=, &ctx, &onode](auto &objhandler) {
|
||||
return objhandler.zero(
|
||||
ObjectDataHandler::context_t{
|
||||
*transaction_manager,
|
||||
*ctx.transaction,
|
||||
*onode,
|
||||
},
|
||||
offset,
|
||||
len);
|
||||
});
|
||||
}
|
||||
|
||||
SeaStore::omap_set_kvs_ret
|
||||
SeaStore::_omap_set_kvs(
|
||||
OnodeRef &onode,
|
||||
|
@ -339,6 +339,10 @@ private:
|
||||
uint64_t offset, size_t len,
|
||||
ceph::bufferlist &&bl,
|
||||
uint32_t fadvise_flags);
|
||||
tm_ret _zero(
|
||||
internal_context_t &ctx,
|
||||
OnodeRef &onode,
|
||||
objaddr_t offset, extent_len_t len);
|
||||
tm_ret _omap_set_values(
|
||||
internal_context_t &ctx,
|
||||
OnodeRef &onode,
|
||||
|
@ -217,6 +217,53 @@ struct seastore_test_t :
|
||||
write(seastore, offset, bl);
|
||||
}
|
||||
|
||||
void zero(
|
||||
SeaStore &seastore,
|
||||
CTransaction &t,
|
||||
uint64_t offset,
|
||||
size_t len) {
|
||||
ceph::buffer::list bl;
|
||||
bl.append_zero(len);
|
||||
bufferlist new_contents;
|
||||
if (offset > 0 && contents.length()) {
|
||||
new_contents.substr_of(
|
||||
contents,
|
||||
0,
|
||||
std::min<size_t>(offset, contents.length())
|
||||
);
|
||||
}
|
||||
new_contents.append_zero(offset - new_contents.length());
|
||||
new_contents.append(bl);
|
||||
|
||||
auto tail_offset = offset + bl.length();
|
||||
if (contents.length() > tail_offset) {
|
||||
bufferlist tail;
|
||||
tail.substr_of(
|
||||
contents,
|
||||
tail_offset,
|
||||
contents.length() - tail_offset);
|
||||
new_contents.append(tail);
|
||||
}
|
||||
contents.swap(new_contents);
|
||||
|
||||
t.zero(
|
||||
cid,
|
||||
oid,
|
||||
offset,
|
||||
len);
|
||||
}
|
||||
|
||||
void zero(
|
||||
SeaStore &seastore,
|
||||
uint64_t offset,
|
||||
size_t len) {
|
||||
CTransaction t;
|
||||
zero(seastore, t, offset, len);
|
||||
seastore.do_transaction(
|
||||
coll,
|
||||
std::move(t)).get0();
|
||||
}
|
||||
|
||||
void read(
|
||||
SeaStore &seastore,
|
||||
uint64_t offset,
|
||||
@ -739,3 +786,61 @@ TEST_F(seastore_test_t, sparse_read)
|
||||
test_obj.remove(*seastore);
|
||||
});
|
||||
}
|
||||
|
||||
TEST_F(seastore_test_t, zero)
|
||||
{
|
||||
run_async([this] {
|
||||
auto test_zero = [this](
|
||||
// [(off, len, repeat)]
|
||||
std::vector<std::tuple<uint64_t, uint64_t, uint64_t>> writes,
|
||||
uint64_t zero_off, uint64_t zero_len) {
|
||||
|
||||
// Test zero within a block
|
||||
auto &test_obj = get_object(make_oid(0));
|
||||
uint64_t size = 0;
|
||||
for (auto &[off, len, repeat]: writes) {
|
||||
for (decltype(repeat) i = 0; i < repeat; ++i) {
|
||||
test_obj.write(*seastore, off + (len * repeat), len, 'a');
|
||||
}
|
||||
size = off + (len * (repeat + 1));
|
||||
}
|
||||
test_obj.read(
|
||||
*seastore,
|
||||
0,
|
||||
size);
|
||||
test_obj.check_size(*seastore);
|
||||
test_obj.zero(*seastore, zero_off, zero_len);
|
||||
test_obj.read(
|
||||
*seastore,
|
||||
0,
|
||||
size);
|
||||
test_obj.check_size(*seastore);
|
||||
remove_object(test_obj);
|
||||
};
|
||||
|
||||
const uint64_t BS = 4<<10;
|
||||
|
||||
// Test zero within a block
|
||||
test_zero(
|
||||
{{1<<10, 1<<10, 1}},
|
||||
1124, 200);
|
||||
|
||||
// Multiple writes, partial on left, partial on right.
|
||||
test_zero(
|
||||
{{BS, BS, 10}},
|
||||
BS + 128,
|
||||
BS * 4);
|
||||
|
||||
// Single large write, block boundary on right, partial on left.
|
||||
test_zero(
|
||||
{{BS, BS * 10, 1}},
|
||||
BS + 128,
|
||||
(BS * 4) - 128);
|
||||
|
||||
// Multiple writes, block boundary on left, partial on right.
|
||||
test_zero(
|
||||
{{BS, BS, 10}},
|
||||
BS,
|
||||
(BS * 4) + 128);
|
||||
});
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user