Merge pull request #9211 from dillaman/wip-15938

librbd: write-after-write might result in an inconsistent replicated image

Reviewed-by: Mykola Golub <mgolub@mirantis.com>
This commit is contained in:
Mykola Golub 2016-05-20 20:52:42 +03:00
commit 17b1c91774
6 changed files with 47 additions and 12 deletions

View File

@ -325,7 +325,8 @@ write_image()
local duration=$(($RANDOM % 35 + 15))
timeout ${duration}s rbd --cluster ${cluster} -p ${POOL} bench-write \
${image} --io-size 4096 --io-threads 8 --io-total 10G --io-pattern rand || true
${image} --io-size 4096 --io-threads 8 --io-total 10G --io-pattern rand \
--debug-rbd=20 --debug-journaler=20 2> ${TEMPDIR}/rbd-bench-write.log || true
}
create_snap()
@ -334,7 +335,8 @@ create_snap()
local image=$2
local snap_name=$3
rbd --cluster ${cluster} -p ${POOL} snap create ${image}@${snap_name}
rbd --cluster ${cluster} -p ${POOL} snap create ${image}@${snap_name} \
--debug-rbd=20 --debug-journaler=20 2> ${TEMPDIR}/rbd-snap-create.log
}
wait_for_snap()

View File

@ -351,13 +351,13 @@ int JournalPlayer::process_playback(uint64_t object_number) {
ldout(m_cct, 10) << __func__ << ": object_num=" << object_number << dendl;
assert(m_lock.is_locked());
ObjectPlayerPtr object_player = get_object_player();
if (verify_playback_ready()) {
notify_entries_available();
} else if (is_object_set_ready()) {
if (m_watch_enabled) {
schedule_watch();
} else {
ObjectPlayerPtr object_player = get_object_player();
uint8_t splay_width = m_journal_metadata->get_splay_width();
uint64_t active_set = m_journal_metadata->get_active_set();
uint64_t object_set = object_player->get_object_number() / splay_width;

View File

@ -167,6 +167,27 @@ namespace librbd {
}
};
/**
 * Completion context that commits a single extent of a journal IO event.
 *
 * On completion it forwards the stored (journal_tid, offset, length) together
 * with the completion result to the image's journal through
 * commit_io_event_extent().
 */
struct C_CommitIOEventExtent : public Context {
  ImageCtx *image_ctx;   // image whose journal receives the commit
  uint64_t journal_tid;  // tid of the journal event being committed
  uint64_t offset;       // extent offset handed to the journal unchanged
  uint64_t length;       // extent length handed to the journal unchanged

  C_CommitIOEventExtent(ImageCtx *image_ctx, uint64_t journal_tid,
                        uint64_t offset, uint64_t length)
    : image_ctx(image_ctx),
      journal_tid(journal_tid),
      offset(offset),
      length(length) {
  }

  virtual void finish(int r) {
    // all IO operations are flushed prior to closing the journal, so a
    // journal must still be attached when this context fires
    auto *journal = image_ctx->journal;
    assert(journal != nullptr);

    // r is passed through so the journal sees the completion result
    journal->commit_io_event_extent(journal_tid, offset, length, r);
  }
};
LibrbdWriteback::LibrbdWriteback(ImageCtx *ictx, Mutex& lock)
: m_tid(0), m_lock(lock), m_ictx(ictx) {
}
@ -248,8 +269,8 @@ namespace librbd {
assert(journal_tid == 0 || m_ictx->journal != NULL);
if (journal_tid != 0) {
m_ictx->journal->flush_event(
journal_tid, new C_WriteJournalCommit(m_ictx, oid.name, object_no, off,
bl, snapc, req_comp,
journal_tid, new C_WriteJournalCommit(m_ictx, oid.name, object_no, off,
bl, snapc, req_comp,
journal_tid));
} else {
AioObjectWrite *req = new AioObjectWrite(m_ictx, oid.name, object_no,
@ -262,22 +283,32 @@ namespace librbd {
void LibrbdWriteback::overwrite_extent(const object_t& oid, uint64_t off,
uint64_t len,
ceph_tid_t journal_tid) {
ceph_tid_t original_journal_tid,
ceph_tid_t new_journal_tid) {
typedef std::vector<std::pair<uint64_t,uint64_t> > Extents;
assert(m_ictx->owner_lock.is_locked());
uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
// all IO operations are flushed prior to closing the journal
assert(journal_tid != 0 && m_ictx->journal != NULL);
assert(original_journal_tid != 0 && m_ictx->journal != NULL);
Extents file_extents;
Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, object_no, off,
len, file_extents);
for (Extents::iterator it = file_extents.begin();
it != file_extents.end(); ++it) {
m_ictx->journal->commit_io_event_extent(journal_tid, it->first,
it->second, 0);
if (new_journal_tid != 0) {
// ensure new journal event is safely committed to disk before
// committing old event
m_ictx->journal->flush_event(
new_journal_tid, new C_CommitIOEventExtent(m_ictx,
original_journal_tid,
it->first, it->second));
} else {
m_ictx->journal->commit_io_event_extent(original_journal_tid, it->first,
it->second, 0);
}
}
}

View File

@ -42,7 +42,8 @@ namespace librbd {
using WritebackHandler::write;
// Called when a cached extent journaled under original_journal_tid is
// overwritten; new_journal_tid is the tid of the overwriting IO's journal
// event (callers pass 0 when there is none).
virtual void overwrite_extent(const object_t& oid, uint64_t off,
                              uint64_t len, ceph_tid_t original_journal_tid,
                              ceph_tid_t new_journal_tid);
virtual void get_client_lock();
virtual void put_client_lock();

View File

@ -462,7 +462,7 @@ void ObjectCacher::Object::replace_journal_tid(BufferHead *bh,
if (bh_tid != 0 && bh_tid != tid) {
// inform journal that it should not expect a writeback from this extent
oc->writeback_handler.overwrite_extent(get_oid(), bh->start(),
bh->length(), bh_tid);
bh->length(), bh_tid, tid);
}
bh->set_journal_tid(tid);
}

View File

@ -37,7 +37,8 @@ class WritebackHandler {
ceph_tid_t journal_tid, Context *oncommit) = 0;
// Hook invoked when a cached extent tagged with original_journal_tid is
// re-tagged with new_journal_tid. The default implementation does nothing;
// handlers that track journal events override it.
virtual void overwrite_extent(const object_t& oid, uint64_t off, uint64_t len,
                              ceph_tid_t original_journal_tid,
                              ceph_tid_t new_journal_tid) {}
virtual bool can_scattered_write() { return false; }
virtual ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,