diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc
index e59ad3dee7e..ec0bd19cfbf 100644
--- a/src/crimson/os/seastore/object_data_handler.cc
+++ b/src/crimson/os/seastore/object_data_handler.cc
@@ -58,6 +58,43 @@ struct extent_to_write_t {
 };
 using extent_to_write_list_t = std::list<extent_to_write_t>;
 
+/**
+ * append_extent_to_write
+ *
+ * Appends passed extent_to_write_t maintaining invariant that the
+ * list may not contain consecutive zero elements by checking and
+ * combining them.
+ */
+void append_extent_to_write(
+  extent_to_write_list_t &to_write, extent_to_write_t &&to_append)
+{
+  assert(
+    to_write.empty() ||
+    (to_write.back().addr + to_write.back().len) == to_append.addr);
+  if (to_write.empty() || to_write.back().to_write || to_append.to_write) {
+    to_write.push_back(std::move(to_append));
+  } else {
+    to_write.back().len += to_append.len;
+  }
+}
+
+/**
+ * splice_extent_to_write
+ *
+ * splices passed extent_to_write_list_t maintaining invariant that the
+ * list may not contain consecutive zero elements by checking and
+ * combining them.
+ */
+void splice_extent_to_write(
+  extent_to_write_list_t &to_write, extent_to_write_list_t &&to_splice)
+{
+  if (!to_splice.empty()) {
+    append_extent_to_write(to_write, std::move(to_splice.front()));
+    to_splice.pop_front();
+    to_write.splice(to_write.end(), std::move(to_splice));
+  }
+}
+
 /// Removes extents/mappings in pins
 ObjectDataHandler::write_ret do_removals(
   context_t ctx,
@@ -177,7 +214,10 @@ split_ret split_pin_left(context_t ctx, LBAPinRef &pin, laddr_t offset)
         (zero_extent_len == 0
          ? std::nullopt
          : std::make_optional(extent_to_write_t(pin_offset, zero_extent_len))),
-        bufferptr(ceph::buffer::create(zero_prepend_len, 0))
+        (zero_prepend_len == 0
+         ? std::nullopt
+         : std::make_optional(
+             bufferptr(ceph::buffer::create(zero_prepend_len, 0))))
       );
     } else {
       // Data, return up to offset to prepend
@@ -213,7 +253,10 @@ split_ret split_pin_right(context_t ctx, LBAPinRef &pin, laddr_t end)
         (zero_extent_len == 0
          ? std::nullopt
          : std::make_optional(extent_to_write_t(aligned_end, zero_extent_len))),
-        bufferptr(ceph::buffer::create(zero_suffix_len, 0))
+        (zero_suffix_len == 0
+         ? std::nullopt
+         : std::make_optional(
+             bufferptr(ceph::buffer::create(zero_suffix_len, 0))))
       );
     } else {
       return read_pin(ctx, pin->duplicate()
@@ -355,6 +398,80 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
   });
 }
 
+/**
+ * get_zero_buffers
+ *
+ * Returns extent_to_write_t's reflecting a zero region extending
+ * from offset~len with headptr optionally on the left and tailptr
+ * optionally on the right.
+ */
+extent_to_write_list_t get_zero_buffers(
+  const extent_len_t block_size,
+  laddr_t offset, extent_len_t len,
+  std::optional<bufferptr> &&headptr, std::optional<bufferptr> &&tailptr)
+{
+  auto zero_left = p2roundup(offset, (laddr_t)block_size);
+  auto zero_right = p2align(offset + len, (laddr_t)block_size);
+  auto left = headptr ? (offset - headptr->length()) : offset;
+  auto right = tailptr ?
+    (offset + len + tailptr->length()) :
+    (offset + len);
+
+  assert(
+    (headptr && ((zero_left - left) ==
+                 p2roundup(headptr->length(), block_size))) ^
+    (!headptr && (zero_left == left)));
+  assert(
+    (tailptr && ((right - zero_right) ==
+                 p2roundup(tailptr->length(), block_size))) ^
+    (!tailptr && (right == zero_right)));
+
+  assert(right > left);
+  assert((left % block_size) == 0);
+  assert((right % block_size) == 0);
+
+  // zero region too small for a reserved section,
+  // headptr and tailptr in same extent
+  if (zero_right <= zero_left) {
+    bufferlist bl;
+    if (headptr) {
+      bl.append(*headptr);
+    }
+    bl.append_zero(
+      right - left - bl.length() - (tailptr ? tailptr->length() : 0));
+    if (tailptr) {
+      bl.append(*tailptr);
+    }
+    assert(bl.length() % block_size == 0);
+    assert(bl.length() == (right - left));
+    return {{left, bl}};
+  } else {
+    // reserved section between ends, headptr and tailptr in different extents
+    extent_to_write_list_t ret;
+    if (headptr) {
+      bufferlist headbl;
+      headbl.append(*headptr);
+      headbl.append_zero(zero_left - left - headbl.length());
+      assert(headbl.length() % block_size == 0);
+      assert(headbl.length() > 0);
+      ret.emplace_back(left, headbl);
+    }
+    // reserved zero region
+    ret.emplace_back(zero_left, zero_right - zero_left);
+    assert(ret.back().len % block_size == 0);
+    assert(ret.back().len > 0);
+    if (tailptr) {
+      bufferlist tailbl;
+      tailbl.append(*tailptr);
+      tailbl.append_zero(right - zero_right - tailbl.length());
+      assert(tailbl.length() % block_size == 0);
+      assert(tailbl.length() > 0);
+      ret.emplace_back(zero_right, tailbl);
+    }
+    return ret;
+  }
+}
+
 /**
  * get_buffers
  *
@@ -372,61 +489,84 @@ extent_to_write_list_t get_buffers(laddr_t offset, bufferlist &bl)
 ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
   context_t ctx,
   laddr_t _offset,
-  bufferlist &&bl,
+  extent_len_t len,
+  std::optional<bufferlist> &&bl,
   lba_pin_list_t &&_pins)
 {
+  if (bl) {
+    assert(bl->length() == len);
+  }
   return seastar::do_with(
     _offset,
     std::move(bl),
+    std::optional<bufferptr>(),
     std::move(_pins),
     extent_to_write_list_t(),
-    [ctx](laddr_t &offset, auto &bl, auto &pins, auto &to_write) {
+    [ctx, len](laddr_t &offset, auto &bl, auto &headptr,
+               auto &pins, auto &to_write) {
       LOG_PREFIX(ObjectDataHandler::overwrite);
       DEBUGT("overwrite: {}~{}",
              ctx.t,
              offset,
-             bl.length());
+             len);
       ceph_assert(pins.size() >= 1);
       auto pin_begin = pins.front()->get_key();
       ceph_assert(pin_begin <= offset);
       auto pin_end = pins.back()->get_key() + pins.back()->get_length();
-      ceph_assert(pin_end >= (offset + bl.length()));
+      ceph_assert(pin_end >= (offset + len));
 
       return split_pin_left(
         ctx,
         pins.front(),
         offset
-      ).si_then([ctx, pin_begin, &offset, &bl, &pins, &to_write](
+      ).si_then([ctx, len, pin_begin, &offset, &headptr, &pins, &to_write](
         auto p) {
-        auto &[left_extent, headptr] = p;
+        auto &[left_extent, _headptr] = p;
         if (left_extent) {
           ceph_assert(left_extent->addr == pin_begin);
-          to_write.push_front(std::move(*left_extent));
+          append_extent_to_write(to_write, std::move(*left_extent));
         }
-        if (headptr) {
-          bufferlist newbl;
-          newbl.append(*headptr);
-          newbl.append(bl);
-          bl.swap(newbl);
-          offset -= headptr->length();
-          assert_aligned(offset);
+        if (_headptr) {
+          assert(_headptr->length() > 0);
+          headptr = std::move(_headptr);
         }
         return split_pin_right(
           ctx,
           pins.back(),
-          offset + bl.length());
-      }).si_then([ctx, pin_end, &offset, &bl, &to_write](
-        auto p) {
+          offset + len);
+      }).si_then([ctx, len, pin_begin, pin_end,
+                  &offset, &bl, &headptr, &to_write](auto p) {
         auto &[right_extent, tailptr] = p;
-        if (tailptr) {
-          bl.append(*tailptr);
-          assert_aligned(bl.length());
+        if (bl) {
+          bufferlist write_bl;
+          if (headptr) {
+            write_bl.append(*headptr);
+            offset -= headptr->length();
+            assert_aligned(offset);
+          }
+          write_bl.claim_append(*bl);
+          if (tailptr) {
+            write_bl.append(*tailptr);
+            assert_aligned(write_bl.length());
+          }
+          splice_extent_to_write(to_write, get_buffers(offset, write_bl));
+        } else {
+          splice_extent_to_write(
+            to_write,
+            get_zero_buffers(
+              ctx.tm.get_block_size(),
+              offset,
+              len,
+              std::move(headptr),
+              std::move(tailptr)));
         }
-        to_write.splice(to_write.end(), get_buffers(offset, bl));
         if (right_extent) {
           ceph_assert((right_extent->addr + right_extent->len) == pin_end);
-          to_write.push_back(std::move(*right_extent));
+          append_extent_to_write(to_write, std::move(*right_extent));
         }
+        assert(to_write.size());
+        assert(pin_begin == to_write.front().addr);
+        assert(pin_end == (to_write.back().addr + to_write.back().len));
         return write_iertr::now();
       }).si_then([ctx, &pins] {
         return do_removals(ctx, pins);
@@ -436,6 +576,49 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
     });
 }
 
+/**
+ * zero
+ *
+ * Zeroes offset~len within the object: prepares the data reservation
+ * for the block-aligned end of the range, fetches the pins over
+ * the corresponding logical range, and hands them to overwrite()
+ * with no buffer so that the region is written as zeros.
+ */
+ObjectDataHandler::zero_ret ObjectDataHandler::zero(
+  context_t ctx,
+  objaddr_t offset,
+  extent_len_t len)
+{
+  return with_object_data(
+    ctx,
+    [this, ctx, offset, len](auto &object_data) {
+      LOG_PREFIX(ObjectDataHandler::zero);
+      DEBUGT("zero to {}~{}, object_data: {}~{}, is_null {}",
+             ctx.t,
+             offset,
+             len,
+             object_data.get_reserved_data_base(),
+             object_data.get_reserved_data_len(),
+             object_data.is_null());
+      return prepare_data_reservation(
+        ctx,
+        object_data,
+        p2roundup(offset + len, ctx.tm.get_block_size())
+      ).si_then([this, ctx, offset, len, &object_data] {
+        auto logical_offset = object_data.get_reserved_data_base() + offset;
+        return ctx.tm.get_pins(
+          ctx.t,
+          logical_offset,
+          len
+        ).si_then([this, ctx, logical_offset, len](auto pins) {
+          return overwrite(
+            ctx, logical_offset, len,
+            std::nullopt, std::move(pins));
+        });
+      });
+    });
+}
+
 ObjectDataHandler::write_ret ObjectDataHandler::write(
   context_t ctx,
   objaddr_t offset,
@@ -464,7 +647,9 @@ ObjectDataHandler::write_ret ObjectDataHandler::write(
           bl.length()
         ).si_then([this, ctx,logical_offset, &bl](
           auto pins) {
-          return overwrite(ctx, logical_offset, bufferlist(bl), std::move(pins));
+          return overwrite(
+            ctx, logical_offset, bl.length(),
+            bufferlist(bl), std::move(pins));
         });
       });
     });
diff --git a/src/crimson/os/seastore/object_data_handler.h b/src/crimson/os/seastore/object_data_handler.h
index dd91f343623..ad6bc414131 100644
--- a/src/crimson/os/seastore/object_data_handler.h
+++ b/src/crimson/os/seastore/object_data_handler.h
@@ -66,6 +66,14 @@ public:
     objaddr_t offset,
     const bufferlist &bl);
 
+  /// Zeroes data in [offset, offset + len)
+  using zero_iertr = base_iertr;
+  using zero_ret = zero_iertr::future<>;
+  zero_ret zero(
+    context_t ctx,
+    objaddr_t offset,
+    extent_len_t len);
+
   /// Reads data in [offset, offset + len)
   using read_iertr = base_iertr;
   using read_ret = read_iertr::future<bufferlist>;
@@ -99,7 +107,8 @@ private:
   write_ret overwrite(
     context_t ctx,        ///< [in] ctx
     laddr_t offset,       ///< [in] write offset
-    bufferlist &&bl,      ///< [in] buffer to write
+    extent_len_t len,     ///< [in] len to write, len == bl->length() if bl
+    std::optional<bufferlist> &&bl, ///< [in] buffer to write, empty for zeros
     lba_pin_list_t &&pins ///< [in] set of pins overlapping above region
   );
 
diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc
index 424c0ea8ee9..ad05bf56cb6 100644
--- a/src/crimson/os/seastore/seastore.cc
+++ b/src/crimson/os/seastore/seastore.cc
@@ -1167,6 +1167,12 @@ SeaStore::tm_ret SeaStore::_do_transaction_step(
         i.decode_bl(hint);
         return tm_iertr::now();
       }
+      case Transaction::OP_ZERO:
+      {
+        objaddr_t off = op->off;
+        extent_len_t len = op->len;
+        return _zero(ctx, get_onode(op->oid), off, len);
+      }
       default:
         ERROR("bad op {}", static_cast<unsigned>(op->op));
         return crimson::ct_error::input_output_error::make();
@@ -1225,6 +1231,36 @@ SeaStore::tm_ret SeaStore::_write(
     });
 }
 
+/// Zeroes offset~len of onode's data, growing the recorded object size
+/// to cover the range; fails with input_output_error when
+/// offset + len >= max_object_size.
+SeaStore::tm_ret SeaStore::_zero(
+  internal_context_t &ctx,
+  OnodeRef &onode,
+  objaddr_t offset,
+  extent_len_t len)
+{
+  LOG_PREFIX(SeaStore::_zero);
+  DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len);
+  if (offset + len >= max_object_size) {
+    return crimson::ct_error::input_output_error::make();
+  }
+  auto &object_size = onode->get_mutable_layout(*ctx.transaction).size;
+  object_size = std::max<uint64_t>(offset + len, object_size);
+  return seastar::do_with(
+    ObjectDataHandler(max_object_size),
+    [=, &ctx, &onode](auto &objhandler) {
+      return objhandler.zero(
+        ObjectDataHandler::context_t{
+          *transaction_manager,
+          *ctx.transaction,
+          *onode,
+        },
+        offset,
+        len);
+    });
+}
+
 SeaStore::omap_set_kvs_ret
 SeaStore::_omap_set_kvs(
   OnodeRef &onode,
diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h
index d75b07a06de..f618a746a06 100644
--- a/src/crimson/os/seastore/seastore.h
+++ b/src/crimson/os/seastore/seastore.h
@@ -339,6 +339,10 @@ private:
       uint64_t offset, size_t len,
       ceph::bufferlist &&bl,
       uint32_t fadvise_flags);
+    tm_ret _zero(
+      internal_context_t &ctx,
+      OnodeRef &onode,
+      objaddr_t offset, extent_len_t len);
     tm_ret _omap_set_values(
       internal_context_t &ctx,
       OnodeRef &onode,
diff --git a/src/test/crimson/seastore/test_seastore.cc b/src/test/crimson/seastore/test_seastore.cc
index 65cced2e030..c6471998d9c 100644
--- a/src/test/crimson/seastore/test_seastore.cc
+++ b/src/test/crimson/seastore/test_seastore.cc
@@ -217,6 +217,54 @@ struct seastore_test_t :
       write(seastore, offset, bl);
     }
 
+    /// Mirrors the zero in the expected contents, then queues it on t
+    void zero(
+      SeaStore &seastore,
+      CTransaction &t,
+      uint64_t offset,
+      size_t len) {
+      ceph::buffer::list bl;
+      bl.append_zero(len);
+      bufferlist new_contents;
+      if (offset > 0 && contents.length()) {
+        new_contents.substr_of(
+          contents,
+          0,
+          std::min<uint64_t>(offset, contents.length())
+        );
+      }
+      new_contents.append_zero(offset - new_contents.length());
+      new_contents.append(bl);
+
+      auto tail_offset = offset + bl.length();
+      if (contents.length() > tail_offset) {
+        bufferlist tail;
+        tail.substr_of(
+          contents,
+          tail_offset,
+          contents.length() - tail_offset);
+        new_contents.append(tail);
+      }
+      contents.swap(new_contents);
+
+      t.zero(
+        cid,
+        oid,
+        offset,
+        len);
+    }
+
+    void zero(
+      SeaStore &seastore,
+      uint64_t offset,
+      size_t len) {
+      CTransaction t;
+      zero(seastore, t, offset, len);
+      seastore.do_transaction(
+        coll,
+        std::move(t)).get0();
+    }
+
     void read(
       SeaStore &seastore,
       uint64_t offset,
@@ -739,3 +787,63 @@ TEST_F(seastore_test_t, sparse_read)
     test_obj.remove(*seastore);
   });
 }
+
+TEST_F(seastore_test_t, zero)
+{
+  run_async([this] {
+    auto test_zero = [this](
+      // [(off, len, repeat)]
+      std::vector<std::tuple<uint64_t, uint64_t, uint64_t>> writes,
+      uint64_t zero_off, uint64_t zero_len) {
+
+      // Populate the object, zero zero_off~zero_len, and re-verify
+      auto &test_obj = get_object(make_oid(0));
+      uint64_t size = 0;
+      for (auto &[off, len, repeat]: writes) {
+        for (decltype(repeat) i = 0; i < repeat; ++i) {
+          // i-th of `repeat` back-to-back len-byte writes starting at off
+          test_obj.write(*seastore, off + (len * i), len, 'a');
+        }
+        // end of the last write above
+        size = off + (len * repeat);
+      }
+      test_obj.read(
+        *seastore,
+        0,
+        size);
+      test_obj.check_size(*seastore);
+      test_obj.zero(*seastore, zero_off, zero_len);
+      test_obj.read(
+        *seastore,
+        0,
+        size);
+      test_obj.check_size(*seastore);
+      remove_object(test_obj);
+    };
+
+    const uint64_t BS = 4<<10;
+
+    // Test zero within a block
+    test_zero(
+      {{1<<10, 1<<10, 1}},
+      1124, 200);
+
+    // Multiple writes, partial on left, partial on right.
+    test_zero(
+      {{BS, BS, 10}},
+      BS + 128,
+      BS * 4);
+
+    // Single large write, block boundary on right, partial on left.
+    test_zero(
+      {{BS, BS * 10, 1}},
+      BS + 128,
+      (BS * 4) - 128);
+
+    // Multiple writes, block boundary on left, partial on right.
+    test_zero(
+      {{BS, BS, 10}},
+      BS,
+      (BS * 4) + 128);
+  });
+}