mirror of
https://github.com/ceph/ceph
synced 2025-03-11 02:39:05 +00:00
osd/: add ExtentCache
Signed-off-by: Samuel Just <sjust@redhat.com>
This commit is contained in:
parent
050c9010e2
commit
5e49a69cec
@ -21,6 +21,7 @@ set(osd_srcs
|
||||
ScrubStore.cc
|
||||
osd_types.cc
|
||||
ECUtil.cc
|
||||
ExtentCache.cc
|
||||
${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc
|
||||
${osdc_osd_srcs})
|
||||
if(HAS_VTA)
|
||||
|
238
src/osd/ExtentCache.cc
Normal file
238
src/osd/ExtentCache.cc
Normal file
@ -0,0 +1,238 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2016 Red Hat
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "ExtentCache.h"
|
||||
|
||||
void ExtentCache::extent::_link_pin_state(pin_state &pin_state)
|
||||
{
|
||||
assert(parent_extent_set);
|
||||
assert(!parent_pin_state);
|
||||
parent_pin_state = &pin_state;
|
||||
pin_state.pin_list.push_back(*this);
|
||||
}
|
||||
|
||||
void ExtentCache::extent::_unlink_pin_state()
|
||||
{
|
||||
assert(parent_extent_set);
|
||||
assert(parent_pin_state);
|
||||
auto liter = pin_state::list::s_iterator_to(*this);
|
||||
parent_pin_state->pin_list.erase(liter);
|
||||
parent_pin_state = nullptr;
|
||||
}
|
||||
|
||||
void ExtentCache::extent::unlink()
|
||||
{
|
||||
assert(parent_extent_set);
|
||||
assert(parent_pin_state);
|
||||
|
||||
_unlink_pin_state();
|
||||
|
||||
// remove from extent set
|
||||
{
|
||||
auto siter = object_extent_set::set::s_iterator_to(*this);
|
||||
auto &set = object_extent_set::set::container_from_iterator(siter);
|
||||
assert(&set == &(parent_extent_set->extent_set));
|
||||
set.erase(siter);
|
||||
}
|
||||
|
||||
parent_extent_set = nullptr;
|
||||
assert(!parent_pin_state);
|
||||
}
|
||||
|
||||
void ExtentCache::extent::link(
|
||||
object_extent_set &extent_set,
|
||||
pin_state &pin_state)
|
||||
{
|
||||
assert(!parent_extent_set);
|
||||
parent_extent_set = &extent_set;
|
||||
extent_set.extent_set.insert(*this);
|
||||
|
||||
_link_pin_state(pin_state);
|
||||
}
|
||||
|
||||
void ExtentCache::extent::move(
|
||||
pin_state &to)
|
||||
{
|
||||
_unlink_pin_state();
|
||||
_link_pin_state(to);
|
||||
}
|
||||
|
||||
void ExtentCache::remove_and_destroy_if_empty(object_extent_set &eset)
|
||||
{
|
||||
if (eset.extent_set.empty()) {
|
||||
auto siter = cache_set::s_iterator_to(eset);
|
||||
auto &set = cache_set::container_from_iterator(siter);
|
||||
assert(&set == &per_object_caches);
|
||||
|
||||
// per_object_caches owns eset
|
||||
per_object_caches.erase(eset);
|
||||
delete &eset;
|
||||
}
|
||||
}
|
||||
|
||||
ExtentCache::object_extent_set &ExtentCache::get_or_create(
|
||||
const hobject_t &oid)
|
||||
{
|
||||
cache_set::insert_commit_data data;
|
||||
auto p = per_object_caches.insert_check(oid, Cmp(), data);
|
||||
if (p.second) {
|
||||
auto *eset = new object_extent_set(oid);
|
||||
per_object_caches.insert_commit(*eset, data);
|
||||
return *eset;
|
||||
} else {
|
||||
return *(p.first);
|
||||
}
|
||||
}
|
||||
|
||||
ExtentCache::object_extent_set *ExtentCache::get_if_exists(
|
||||
const hobject_t &oid)
|
||||
{
|
||||
cache_set::insert_commit_data data;
|
||||
auto p = per_object_caches.insert_check(oid, Cmp(), data);
|
||||
if (p.second) {
|
||||
return nullptr;
|
||||
} else {
|
||||
return &*(p.first);
|
||||
}
|
||||
}
|
||||
|
||||
std::pair<
|
||||
ExtentCache::object_extent_set::set::iterator,
|
||||
ExtentCache::object_extent_set::set::iterator
|
||||
> ExtentCache::object_extent_set::get_containing_range(
|
||||
uint64_t off, uint64_t len)
|
||||
{
|
||||
// fst is first iterator with end after off (may be end)
|
||||
auto fst = extent_set.upper_bound(off, uint_cmp());
|
||||
if (fst != extent_set.begin())
|
||||
--fst;
|
||||
if (fst != extent_set.end() && off >= (fst->offset + fst->get_length()))
|
||||
++fst;
|
||||
|
||||
// lst is first iterator with start >= off + len (may be end)
|
||||
auto lst = extent_set.lower_bound(off + len, uint_cmp());
|
||||
return std::make_pair(fst, lst);
|
||||
}
|
||||
|
||||
extent_set ExtentCache::reserve_extents_for_rmw(
|
||||
const hobject_t &oid,
|
||||
write_pin &pin,
|
||||
const extent_set &to_write,
|
||||
const extent_set &to_read)
|
||||
{
|
||||
if (to_write.empty() && to_read.empty()) {
|
||||
return extent_set();
|
||||
}
|
||||
extent_set must_read;
|
||||
auto &eset = get_or_create(oid);
|
||||
extent_set missing;
|
||||
for (auto &&res: to_write) {
|
||||
eset.traverse_update(
|
||||
pin,
|
||||
res.first,
|
||||
res.second,
|
||||
[&](uint64_t off, uint64_t len,
|
||||
extent *ext, object_extent_set::update_action *action) {
|
||||
action->action = object_extent_set::update_action::UPDATE_PIN;
|
||||
if (!ext) {
|
||||
missing.insert(off, len);
|
||||
}
|
||||
});
|
||||
}
|
||||
must_read.intersection_of(
|
||||
to_read,
|
||||
missing);
|
||||
return must_read;
|
||||
}
|
||||
|
||||
extent_map ExtentCache::get_remaining_extents_for_rmw(
|
||||
const hobject_t &oid,
|
||||
write_pin &pin,
|
||||
const extent_set &to_get)
|
||||
{
|
||||
if (to_get.empty()) {
|
||||
return extent_map();
|
||||
}
|
||||
extent_map ret;
|
||||
auto &eset = get_or_create(oid);
|
||||
for (auto &&res: to_get) {
|
||||
bufferlist bl;
|
||||
eset.traverse_update(
|
||||
pin,
|
||||
res.first,
|
||||
res.second,
|
||||
[&](uint64_t off, uint64_t len,
|
||||
extent *ext, object_extent_set::update_action *action) {
|
||||
action->action = object_extent_set::update_action::NONE;
|
||||
assert(ext && ext->bl && ext->pinned_by_write());
|
||||
bl.substr_of(
|
||||
*(ext->bl),
|
||||
off - ext->offset,
|
||||
len);
|
||||
});
|
||||
ret.insert(res.first, res.second, bl);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void ExtentCache::present_rmw_update(
|
||||
const hobject_t &oid,
|
||||
write_pin &pin,
|
||||
const extent_map &extents)
|
||||
{
|
||||
if (extents.empty()) {
|
||||
return;
|
||||
}
|
||||
auto &eset = get_or_create(oid);
|
||||
for (auto &&res: extents) {
|
||||
eset.traverse_update(
|
||||
pin,
|
||||
res.get_off(),
|
||||
res.get_len(),
|
||||
[&](uint64_t off, uint64_t len,
|
||||
extent *ext, object_extent_set::update_action *action) {
|
||||
action->action = object_extent_set::update_action::NONE;
|
||||
assert(ext && ext->pinned_by_write());
|
||||
action->bl = bufferlist();
|
||||
action->bl->substr_of(
|
||||
res.get_val(),
|
||||
off - res.get_off(),
|
||||
len);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
ostream &ExtentCache::print(ostream &out) const
|
||||
{
|
||||
out << "ExtentCache(" << std::endl;
|
||||
for (auto esiter = per_object_caches.begin();
|
||||
esiter != per_object_caches.end();
|
||||
++esiter) {
|
||||
out << " Extents(" << esiter->oid << ")[" << std::endl;
|
||||
for (auto exiter = esiter->extent_set.begin();
|
||||
exiter != esiter->extent_set.end();
|
||||
++exiter) {
|
||||
out << " Extent(" << exiter->offset
|
||||
<< "~" << exiter->get_length()
|
||||
<< ":" << exiter->pin_tid()
|
||||
<< ")" << std::endl;
|
||||
}
|
||||
}
|
||||
return out << ")" << std::endl;
|
||||
}
|
||||
|
||||
ostream &operator<<(ostream &lhs, const ExtentCache &cache)
|
||||
{
|
||||
return cache.print(lhs);
|
||||
}
|
491
src/osd/ExtentCache.h
Normal file
491
src/osd/ExtentCache.h
Normal file
@ -0,0 +1,491 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2016 Red Hat
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef EXTENT_CACHE_H
|
||||
#define EXTENT_CACHE_H
|
||||
|
||||
#include <map>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <boost/optional.hpp>
|
||||
#include <boost/intrusive/set.hpp>
|
||||
#include <boost/intrusive/list.hpp>
|
||||
#include "include/interval_set.h"
|
||||
#include "common/interval_map.h"
|
||||
#include "include/buffer.h"
|
||||
#include "common/hobject.h"
|
||||
|
||||
/**
|
||||
ExtentCache
|
||||
|
||||
The main purpose of this cache is to ensure that we can pipeline
|
||||
overlapping partial overwrites.
|
||||
|
||||
To that end we need to ensure that an extent pinned for an operation is
|
||||
live until that operation completes. However, a particular extent
|
||||
might be pinned by multiple operations (several pipelined writes
|
||||
on the same object).
|
||||
|
||||
1) When we complete an operation, we only look at extents owned only
|
||||
by that operation.
|
||||
2) Per-extent overhead is fixed size.
|
||||
2) Per-operation metadata is fixed size.
|
||||
|
||||
This is simple enough to realize with two main structures:
|
||||
- extent: contains a pointer to the pin owning it and intrusive list
|
||||
pointers to other extents owned by the same pin
|
||||
- pin_state: contains the list head for extents owned by it
|
||||
|
||||
This works as long as we only need to remember one "owner" for
|
||||
each extent. To make this work, we'll need to leverage some
|
||||
invariants guaranteed by higher layers:
|
||||
|
||||
1) Writes on a particular object must be ordered
|
||||
2) A particular object will have outstanding reads or writes, but not
|
||||
both (note that you can have a read while a write is committed, but
|
||||
not applied).
|
||||
|
||||
Our strategy therefore will be to have whichever in-progress op will
|
||||
finish "last" be the owner of a particular extent. For now, we won't
|
||||
cache reads, so 2) simply means that we can assume that reads and
|
||||
recovery operations imply no unstable extents on the object in
|
||||
question.
|
||||
|
||||
Write: WaitRead -> WaitCommit -> Complete
|
||||
|
||||
Invariant 1) above actually indicates that we can't have writes
|
||||
bypassing the WaitRead state while there are writes waiting on
|
||||
Reads. Thus, the set of operations pinning a particular extent
|
||||
must always complete in order or arrival.
|
||||
|
||||
This suggests that a particular extent may be in only the following
|
||||
states:
|
||||
|
||||
|
||||
0) Empty (not in the map at all)
|
||||
1) Write Pending N
|
||||
- Some write with reqid <= N is currently fetching the data for
|
||||
this extent
|
||||
- The extent must persist until Write reqid N completes
|
||||
- All ops pinning this extent are writes in the WaitRead state of
|
||||
the Write pipeline (there must be an in progress write, so no
|
||||
reads can be in progress).
|
||||
2) Write Pinned N:
|
||||
- This extent has data corresponding to some reqid M <= N
|
||||
- The extent must persist until Write reqid N commits
|
||||
- All ops pinning this extent are writes in some Write
|
||||
state (all are possible). Reads are not possible
|
||||
in this state (or the others) due to 2).
|
||||
|
||||
All of the above suggests that there are 3 things users can
|
||||
ask of the cache corresponding to the 3 Write pipelines
|
||||
states.
|
||||
*/
|
||||
|
||||
/// If someone wants these types, but not ExtentCache, move to another file
|
||||
struct bl_split_merge {
|
||||
bufferlist split(
|
||||
uint64_t offset,
|
||||
uint64_t length,
|
||||
bufferlist &bl) const {
|
||||
bufferlist out;
|
||||
out.substr_of(bl, offset, length);
|
||||
return out;
|
||||
}
|
||||
bool can_merge(const bufferlist &left, const bufferlist &right) const {
|
||||
return true;
|
||||
}
|
||||
bufferlist merge(bufferlist &&left, bufferlist &&right) const {
|
||||
bufferlist bl;
|
||||
bl.claim(left);
|
||||
bl.claim_append(right);
|
||||
return bl;
|
||||
}
|
||||
uint64_t length(const bufferlist &b) const { return b.length(); }
|
||||
};
|
||||
using extent_set = interval_set<uint64_t>;
|
||||
using extent_map = interval_map<uint64_t, bufferlist, bl_split_merge>;
|
||||
|
||||
class ExtentCache {
|
||||
struct object_extent_set;
|
||||
struct pin_state;
|
||||
private:
|
||||
|
||||
struct extent {
|
||||
object_extent_set *parent_extent_set = nullptr;
|
||||
pin_state *parent_pin_state = nullptr;
|
||||
boost::intrusive::set_member_hook<> extent_set_member;
|
||||
boost::intrusive::list_member_hook<> pin_list_member;
|
||||
|
||||
uint64_t offset;
|
||||
uint64_t length;
|
||||
boost::optional<bufferlist> bl;
|
||||
|
||||
uint64_t get_length() const {
|
||||
return length;
|
||||
}
|
||||
|
||||
bool is_pending() const {
|
||||
return bl == boost::none;
|
||||
}
|
||||
|
||||
bool pinned_by_write() const {
|
||||
assert(parent_pin_state);
|
||||
return parent_pin_state->is_write();
|
||||
}
|
||||
|
||||
uint64_t pin_tid() const {
|
||||
assert(parent_pin_state);
|
||||
return parent_pin_state->tid;
|
||||
}
|
||||
|
||||
extent(uint64_t offset, bufferlist _bl)
|
||||
: offset(offset), length(_bl.length()), bl(_bl) {}
|
||||
|
||||
extent(uint64_t offset, uint64_t length)
|
||||
: offset(offset), length(length) {}
|
||||
|
||||
bool operator<(const extent &rhs) const {
|
||||
return offset < rhs.offset;
|
||||
}
|
||||
private:
|
||||
// can briefly violate the two link invariant, used in unlink() and move()
|
||||
void _link_pin_state(pin_state &pin_state);
|
||||
void _unlink_pin_state();
|
||||
public:
|
||||
void unlink();
|
||||
void link(object_extent_set &parent_extent_set, pin_state &pin_state);
|
||||
void move(pin_state &to);
|
||||
};
|
||||
|
||||
struct object_extent_set : boost::intrusive::set_base_hook<> {
|
||||
hobject_t oid;
|
||||
object_extent_set(const hobject_t &oid) : oid(oid) {}
|
||||
|
||||
using set_member_options = boost::intrusive::member_hook<
|
||||
extent,
|
||||
boost::intrusive::set_member_hook<>,
|
||||
&extent::extent_set_member>;
|
||||
using set = boost::intrusive::set<extent, set_member_options>;
|
||||
set extent_set;
|
||||
|
||||
bool operator<(const object_extent_set &rhs) const {
|
||||
return cmp_bitwise(oid, rhs.oid) < 0;
|
||||
}
|
||||
|
||||
struct uint_cmp {
|
||||
bool operator()(uint64_t lhs, const extent &rhs) const {
|
||||
return lhs < rhs.offset;
|
||||
}
|
||||
bool operator()(const extent &lhs, uint64_t rhs) const {
|
||||
return lhs.offset < rhs;
|
||||
}
|
||||
};
|
||||
std::pair<set::iterator, set::iterator> get_containing_range(
|
||||
uint64_t offset, uint64_t length);
|
||||
|
||||
void erase(uint64_t offset, uint64_t length);
|
||||
|
||||
struct update_action {
|
||||
enum type {
|
||||
NONE,
|
||||
UPDATE_PIN
|
||||
};
|
||||
type action = NONE;
|
||||
boost::optional<bufferlist> bl;
|
||||
};
|
||||
template <typename F>
|
||||
void traverse_update(
|
||||
pin_state &pin,
|
||||
uint64_t offset,
|
||||
uint64_t length,
|
||||
F &&f) {
|
||||
auto range = get_containing_range(offset, length);
|
||||
|
||||
if (range.first == range.second || range.first->offset > offset) {
|
||||
uint64_t extlen = range.first == range.second ?
|
||||
length : range.first->offset - offset;
|
||||
|
||||
update_action action;
|
||||
f(offset, extlen, nullptr, &action);
|
||||
assert(!action.bl || action.bl->length() == extlen);
|
||||
if (action.action == update_action::UPDATE_PIN) {
|
||||
extent *ext = action.bl ?
|
||||
new extent(offset, *action.bl) :
|
||||
new extent(offset, extlen);
|
||||
ext->link(*this, pin);
|
||||
} else {
|
||||
assert(!action.bl);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto p = range.first; p != range.second;) {
|
||||
extent *ext = &*p;
|
||||
++p;
|
||||
|
||||
uint64_t extoff = MAX(ext->offset, offset);
|
||||
uint64_t extlen = MIN(
|
||||
ext->length - (extoff - ext->offset),
|
||||
offset + length - extoff);
|
||||
|
||||
update_action action;
|
||||
f(extoff, extlen, ext, &action);
|
||||
assert(!action.bl || action.bl->length() == extlen);
|
||||
extent *final_extent = nullptr;
|
||||
if (action.action == update_action::NONE) {
|
||||
final_extent = ext;
|
||||
} else {
|
||||
pin_state *ps = ext->parent_pin_state;
|
||||
ext->unlink();
|
||||
if ((ext->offset < offset) &&
|
||||
(ext->offset + ext->get_length() > offset)) {
|
||||
extent *head = nullptr;
|
||||
if (ext->bl) {
|
||||
bufferlist bl;
|
||||
bl.substr_of(
|
||||
*(ext->bl),
|
||||
0,
|
||||
offset - ext->offset);
|
||||
head = new extent(ext->offset, bl);
|
||||
} else {
|
||||
head = new extent(
|
||||
ext->offset, offset - ext->offset);
|
||||
}
|
||||
head->link(*this, *ps);
|
||||
}
|
||||
if ((ext->offset + ext->length > offset + length) &&
|
||||
(offset + length > ext->offset)) {
|
||||
uint64_t nlen =
|
||||
(ext->offset + ext->get_length()) - (offset + length);
|
||||
extent *tail = nullptr;
|
||||
if (ext->bl) {
|
||||
bufferlist bl;
|
||||
bl.substr_of(
|
||||
*(ext->bl),
|
||||
ext->get_length() - nlen,
|
||||
nlen);
|
||||
tail = new extent(offset + length, bl);
|
||||
} else {
|
||||
tail = new extent(offset + length, nlen);
|
||||
}
|
||||
tail->link(*this, *ps);
|
||||
}
|
||||
if (action.action == update_action::UPDATE_PIN) {
|
||||
if (ext->bl) {
|
||||
bufferlist bl;
|
||||
bl.substr_of(
|
||||
*(ext->bl),
|
||||
extoff - ext->offset,
|
||||
extlen);
|
||||
final_extent = new ExtentCache::extent(
|
||||
extoff,
|
||||
bl);
|
||||
} else {
|
||||
final_extent = new ExtentCache::extent(
|
||||
extoff, extlen);
|
||||
}
|
||||
final_extent->link(*this, pin);
|
||||
}
|
||||
delete ext;
|
||||
}
|
||||
|
||||
if (action.bl) {
|
||||
assert(final_extent);
|
||||
assert(final_extent->length == action.bl->length());
|
||||
final_extent->bl = *(action.bl);
|
||||
}
|
||||
|
||||
uint64_t next_off = p == range.second ?
|
||||
offset + length : p->offset;
|
||||
if (extoff + extlen < next_off) {
|
||||
uint64_t tailoff = extoff + extlen;
|
||||
uint64_t taillen = next_off - tailoff;
|
||||
|
||||
update_action action;
|
||||
f(tailoff, taillen, nullptr, &action);
|
||||
assert(!action.bl || action.bl->length() == taillen);
|
||||
if (action.action == update_action::UPDATE_PIN) {
|
||||
extent *ext = action.bl ?
|
||||
new extent(tailoff, *action.bl) :
|
||||
new extent(tailoff, taillen);
|
||||
ext->link(*this, pin);
|
||||
} else {
|
||||
assert(!action.bl);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
struct Cmp {
|
||||
bool operator()(const hobject_t &oid, const object_extent_set &rhs) const {
|
||||
return cmp_bitwise(oid, rhs.oid) < 0;
|
||||
}
|
||||
bool operator()(const object_extent_set &lhs, const hobject_t &oid) const {
|
||||
return cmp_bitwise(lhs.oid, oid) < 0;
|
||||
}
|
||||
};
|
||||
|
||||
object_extent_set &get_or_create(const hobject_t &oid);
|
||||
object_extent_set *get_if_exists(const hobject_t &oid);
|
||||
|
||||
void remove_and_destroy_if_empty(object_extent_set &set);
|
||||
using cache_set = boost::intrusive::set<object_extent_set>;
|
||||
cache_set per_object_caches;
|
||||
|
||||
uint64_t next_write_tid = 1;
|
||||
uint64_t next_read_tid = 1;
|
||||
struct pin_state {
|
||||
uint64_t tid = 0;
|
||||
enum pin_type_t {
|
||||
NONE,
|
||||
WRITE,
|
||||
};
|
||||
pin_type_t pin_type = NONE;
|
||||
bool is_write() const { return pin_type == WRITE; }
|
||||
|
||||
pin_state(const pin_state &other) = delete;
|
||||
pin_state &operator=(const pin_state &other) = delete;
|
||||
pin_state(pin_state &&other) = delete;
|
||||
pin_state() = default;
|
||||
|
||||
using list_member_options = boost::intrusive::member_hook<
|
||||
extent,
|
||||
boost::intrusive::list_member_hook<>,
|
||||
&extent::pin_list_member>;
|
||||
using list = boost::intrusive::list<extent, list_member_options>;
|
||||
list pin_list;
|
||||
~pin_state() {
|
||||
assert(pin_list.empty());
|
||||
assert(tid == 0);
|
||||
assert(pin_type == NONE);
|
||||
}
|
||||
void _open(uint64_t in_tid, pin_type_t in_type) {
|
||||
assert(pin_type == NONE);
|
||||
assert(in_tid > 0);
|
||||
tid = in_tid;
|
||||
pin_type = in_type;
|
||||
}
|
||||
};
|
||||
|
||||
void release_pin(pin_state &p) {
|
||||
for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) {
|
||||
unique_ptr<extent> extent(&*iter); // we now own this
|
||||
iter++; // unlink will invalidate
|
||||
assert(extent->parent_extent_set);
|
||||
auto &eset = *(extent->parent_extent_set);
|
||||
extent->unlink();
|
||||
remove_and_destroy_if_empty(eset);
|
||||
}
|
||||
p.tid = 0;
|
||||
p.pin_type = pin_state::NONE;
|
||||
}
|
||||
|
||||
public:
|
||||
class write_pin : private pin_state {
|
||||
friend class ExtentCache;
|
||||
private:
|
||||
void open(uint64_t in_tid) {
|
||||
_open(in_tid, pin_state::WRITE);
|
||||
}
|
||||
public:
|
||||
write_pin() : pin_state() {}
|
||||
};
|
||||
|
||||
void open_write_pin(write_pin &pin) {
|
||||
pin.open(next_write_tid++);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reserves extents required for rmw, and learn
|
||||
* which need to be read
|
||||
*
|
||||
* Pins all extents in to_write. Returns subset of to_read not
|
||||
* currently present in the cache. Caller must obtain those
|
||||
* extents before calling get_remaining_extents_for_rmw.
|
||||
*
|
||||
* Transition table:
|
||||
* - Empty -> Write Pending pin.reqid
|
||||
* - Write Pending N -> Write Pending pin.reqid
|
||||
* - Write Pinned N -> Write Pinned pin.reqid
|
||||
*
|
||||
* @param oid [in] object undergoing rmw
|
||||
* @param pin [in,out] pin to use (obtained from create_write_pin)
|
||||
* @param to_write [in] extents which will be written
|
||||
* @param to_read [in] extents to read prior to write (must be subset
|
||||
* of to_write)
|
||||
* @return subset of to_read which isn't already present or pending
|
||||
*/
|
||||
extent_set reserve_extents_for_rmw(
|
||||
const hobject_t &oid,
|
||||
write_pin &pin,
|
||||
const extent_set &to_write,
|
||||
const extent_set &to_read);
|
||||
|
||||
/**
|
||||
* Gets extents required for rmw not returned from
|
||||
* reserve_extents_for_rmw
|
||||
*
|
||||
* Requested extents (to_get) must be the set to_read \ the set
|
||||
* returned from reserve_extents_for_rmw. No transition table,
|
||||
* all extents at this point must be present and already pinned
|
||||
* for this pin by reserve_extents_for_rmw.
|
||||
*
|
||||
* @param oid [in] object
|
||||
* @param pin [in,out] pin associated with this IO
|
||||
* @param to_get [in] extents to get (see above for restrictions)
|
||||
* @return map of buffers from to_get
|
||||
*/
|
||||
extent_map get_remaining_extents_for_rmw(
|
||||
const hobject_t &oid,
|
||||
write_pin &pin,
|
||||
const extent_set &to_get);
|
||||
|
||||
/**
|
||||
* Updates the cache to reflect the rmw write
|
||||
*
|
||||
* All presented extents must already have been specified in
|
||||
* reserve_extents_for_rmw under to_write.
|
||||
*
|
||||
* Transition table:
|
||||
* - Empty -> invalid, must call reserve_extents_for_rmw first
|
||||
* - Write Pending N -> Write Pinned N, update buffer
|
||||
* (assert N >= pin.reqid)
|
||||
* - Write Pinned N -> Update buffer (assert N >= pin.reqid)
|
||||
*
|
||||
* @param oid [in] object
|
||||
* @param pin [in,out] pin associated with this IO
|
||||
* @param extents [in] map of buffers to update
|
||||
* @return void
|
||||
*/
|
||||
void present_rmw_update(
|
||||
const hobject_t &oid,
|
||||
write_pin &pin,
|
||||
const extent_map &extents);
|
||||
|
||||
/**
|
||||
* Release all buffers pinned by pin
|
||||
*/
|
||||
void release_write_pin(
|
||||
write_pin &pin) {
|
||||
release_pin(pin);
|
||||
}
|
||||
|
||||
ostream &print(
|
||||
ostream &out) const;
|
||||
};
|
||||
|
||||
ostream &operator<<(ostream &lhs, const ExtentCache &cache);
|
||||
|
||||
#endif
|
@ -77,6 +77,13 @@ add_executable(unittest_osd_osdcap
|
||||
add_ceph_unittest(unittest_osd_osdcap ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_osd_osdcap)
|
||||
target_link_libraries(unittest_osd_osdcap osd global ${BLKID_LIBRARIES})
|
||||
|
||||
# unittest ExtentCache
|
||||
add_executable(unittest_extent_cache
|
||||
test_extent_cache.cc
|
||||
)
|
||||
add_ceph_unittest(unittest_extent_cache ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_extent_cache)
|
||||
target_link_libraries(unittest_extent_cache osd global ${BLKID_LIBRARIES})
|
||||
|
||||
# unittest PGTransaction
|
||||
add_executable(unittest_pg_transaction
|
||||
test_pg_transaction.cc
|
||||
|
168
src/test/osd/test_extent_cache.cc
Normal file
168
src/test/osd/test_extent_cache.cc
Normal file
@ -0,0 +1,168 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2016 Red Hat
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "osd/ExtentCache.h"
|
||||
#include <iostream>
|
||||
|
||||
extent_map imap_from_vector(vector<pair<uint64_t, uint64_t> > &&in)
|
||||
{
|
||||
extent_map out;
|
||||
for (auto &&tup: in) {
|
||||
bufferlist bl;
|
||||
bl.append_zero(tup.second);
|
||||
out.insert(tup.first, bl.length(), bl);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
extent_map imap_from_iset(const extent_set &set)
|
||||
{
|
||||
extent_map out;
|
||||
for (auto &&iter: set) {
|
||||
bufferlist bl;
|
||||
bl.append_zero(iter.second);
|
||||
out.insert(iter.first, iter.second, bl);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
extent_set iset_from_vector(vector<pair<uint64_t, uint64_t> > &&in)
|
||||
{
|
||||
extent_set out;
|
||||
for (auto &&tup: in) {
|
||||
out.insert(tup.first, tup.second);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
TEST(extentcache, simple_write)
|
||||
{
|
||||
hobject_t oid;
|
||||
|
||||
ExtentCache c;
|
||||
ExtentCache::write_pin pin;
|
||||
c.open_write_pin(pin);
|
||||
|
||||
auto to_read = iset_from_vector(
|
||||
{{0, 2}, {8, 2}, {20, 2}});
|
||||
auto to_write = iset_from_vector(
|
||||
{{0, 10}, {20, 4}});
|
||||
auto must_read = c.reserve_extents_for_rmw(
|
||||
oid, pin, to_write, to_read);
|
||||
ASSERT_EQ(
|
||||
must_read,
|
||||
to_read);
|
||||
|
||||
c.print(std::cerr);
|
||||
|
||||
auto got = imap_from_iset(must_read);
|
||||
auto pending_read = to_read;
|
||||
pending_read.subtract(must_read);
|
||||
|
||||
auto pending = c.get_remaining_extents_for_rmw(
|
||||
oid,
|
||||
pin,
|
||||
pending_read);
|
||||
ASSERT_TRUE(pending.empty());
|
||||
|
||||
auto write_map = imap_from_iset(to_write);
|
||||
c.present_rmw_update(
|
||||
oid,
|
||||
pin,
|
||||
write_map);
|
||||
|
||||
c.release_write_pin(pin);
|
||||
}
|
||||
|
||||
TEST(extentcache, write_write_overlap)
|
||||
{
|
||||
hobject_t oid;
|
||||
|
||||
ExtentCache c;
|
||||
ExtentCache::write_pin pin;
|
||||
c.open_write_pin(pin);
|
||||
|
||||
// start write 1
|
||||
auto to_read = iset_from_vector(
|
||||
{{0, 2}, {8, 2}, {20, 2}});
|
||||
auto to_write = iset_from_vector(
|
||||
{{0, 10}, {20, 4}});
|
||||
auto must_read = c.reserve_extents_for_rmw(
|
||||
oid, pin, to_write, to_read);
|
||||
ASSERT_EQ(
|
||||
must_read,
|
||||
to_read);
|
||||
|
||||
c.print(std::cerr);
|
||||
|
||||
// start write 2
|
||||
ExtentCache::write_pin pin2;
|
||||
c.open_write_pin(pin2);
|
||||
auto to_read2 = iset_from_vector(
|
||||
{{2, 4}, {10, 4}, {18, 4}});
|
||||
auto to_write2 = iset_from_vector(
|
||||
{{2, 12}, {18, 12}});
|
||||
auto must_read2 = c.reserve_extents_for_rmw(
|
||||
oid, pin2, to_write2, to_read2);
|
||||
ASSERT_EQ(
|
||||
must_read2,
|
||||
iset_from_vector({{10, 4}, {18, 2}}));
|
||||
|
||||
c.print(std::cerr);
|
||||
|
||||
// complete read for write 1 and start commit
|
||||
auto got = imap_from_iset(must_read);
|
||||
auto pending_read = to_read;
|
||||
pending_read.subtract(must_read);
|
||||
auto pending = c.get_remaining_extents_for_rmw(
|
||||
oid,
|
||||
pin,
|
||||
pending_read);
|
||||
ASSERT_TRUE(pending.empty());
|
||||
|
||||
auto write_map = imap_from_iset(to_write);
|
||||
c.present_rmw_update(
|
||||
oid,
|
||||
pin,
|
||||
write_map);
|
||||
|
||||
c.print(std::cerr);
|
||||
|
||||
// complete read for write 2 and start commit
|
||||
auto pending_read2 = to_read2;
|
||||
pending_read2.subtract(must_read2);
|
||||
auto pending2 = c.get_remaining_extents_for_rmw(
|
||||
oid,
|
||||
pin2,
|
||||
pending_read2);
|
||||
ASSERT_EQ(
|
||||
pending2,
|
||||
imap_from_iset(pending_read2));
|
||||
|
||||
auto write_map2 = imap_from_iset(to_write2);
|
||||
c.present_rmw_update(
|
||||
oid,
|
||||
pin2,
|
||||
write_map2);
|
||||
|
||||
c.print(std::cerr);
|
||||
|
||||
c.release_write_pin(pin);
|
||||
|
||||
c.print(std::cerr);
|
||||
|
||||
c.release_write_pin(pin2);
|
||||
}
|
Loading…
Reference in New Issue
Block a user