Merge pull request #12212 from trociny/wip-16991

rbd-mirror: recovering after split-brain

Reviewed-by: Jason Dillaman <dillaman@redhat.com>
This commit is contained in:
Jason Dillaman 2016-12-05 11:20:03 -05:00 committed by GitHub
commit 566d01bfd2
8 changed files with 61 additions and 16 deletions

View File

@@ -357,4 +357,18 @@ test -z "$(get_mirror_position ${CLUSTER2} ${POOL} ${image})"
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+error' 'disconnected'
testlog "TEST: split-brain"
image=split-brain
create_image ${CLUSTER2} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
demote_image ${CLUSTER2} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
promote_image ${CLUSTER1} ${POOL} ${image}
write_image ${CLUSTER1} ${POOL} ${image} 10
demote_image ${CLUSTER1} ${POOL} ${image}
promote_image ${CLUSTER2} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+error' 'split-brain'
request_resync_image ${CLUSTER1} ${POOL} ${image} image_id
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
echo OK

View File

@@ -483,8 +483,8 @@ test_status_in_pool_dir()
local status_log=${TEMPDIR}/${cluster}-${image}.mirror_status
rbd --cluster ${cluster} -p ${pool} mirror image status ${image} |
tee ${status_log} >&2
grep "state: .*${state_pattern}" ${status_log}
grep "description: .*${description_pattern}" ${status_log}
grep "state: .*${state_pattern}" ${status_log} || return 1
grep "description: .*${description_pattern}" ${status_log} || return 1
}
wait_for_status_in_pool_dir()

View File

@@ -402,14 +402,14 @@ public:
remote_mirror_uuid,
&mock_journaler,
&m_mirror_peer_client_meta,
on_finish);
on_finish, &m_do_resync);
}
librbd::ImageCtx *m_remote_image_ctx;
librbd::ImageCtx *m_local_image_ctx = nullptr;
librbd::MockTestImageCtx *m_local_test_image_ctx = nullptr;
librbd::journal::MirrorPeerClientMeta m_mirror_peer_client_meta;
bool m_do_resync;
};
TEST_F(TestMockImageReplayerBootstrapRequest, NonPrimaryRemoteSyncingState) {

View File

@@ -75,6 +75,7 @@ struct BootstrapRequest<librbd::MockTestImageCtx> {
::journal::MockJournalerProxy *journaler,
librbd::journal::MirrorPeerClientMeta *client_meta,
Context *on_finish,
bool *do_resync,
rbd::mirror::ProgressContext *progress_ctx = nullptr) {
assert(s_instance != nullptr);
s_instance->on_finish = on_finish;

View File

@@ -398,7 +398,7 @@ void ImageReplayer<I>::bootstrap() {
&m_local_image_ctx, m_local_image_name, m_remote_image_id,
m_global_image_id, m_threads->work_queue, m_threads->timer,
&m_threads->timer_lock, m_local_mirror_uuid, m_remote_mirror_uuid,
m_remote_journaler, &m_client_meta, ctx, &m_progress_cxt);
m_remote_journaler, &m_client_meta, ctx, &m_do_resync, &m_progress_cxt);
{
Mutex::Locker locker(m_lock);
@@ -429,6 +429,9 @@ void ImageReplayer<I>::handle_bootstrap(int r) {
dout(5) << "remote image is non-primary or local image is primary" << dendl;
on_start_fail(0, "remote image is non-primary or local image is primary");
return;
} else if (r == -EEXIST) {
on_start_fail(r, "split-brain detected");
return;
} else if (r < 0) {
on_start_fail(r, "error bootstrapping replay");
return;
@@ -436,7 +439,6 @@ void ImageReplayer<I>::handle_bootstrap(int r) {
return;
}
assert(m_local_journal == nullptr);
{
RWLock::RLocker snap_locker(m_local_image_ctx->snap_lock);
@@ -453,13 +455,8 @@ void ImageReplayer<I>::handle_bootstrap(int r) {
{
Mutex::Locker locker(m_lock);
bool do_resync = false;
r = m_local_image_ctx->journal->is_resync_requested(&do_resync);
if (r < 0) {
derr << "failed to check if a resync was requested" << dendl;
}
if (do_resync) {
if (m_do_resync) {
Context *on_finish = m_on_start_finish;
m_stopping_for_resync = true;
FunctionContext *ctx = new FunctionContext([this, on_finish](int r) {

View File

@@ -253,6 +253,7 @@ private:
int m_last_r = 0;
std::string m_state_desc;
BootstrapProgressContext m_progress_cxt;
bool m_do_resync;
image_replayer::EventPreprocessor<ImageCtxT> *m_event_preprocessor = nullptr;
image_replayer::ReplayStatusFormatter<ImageCtxT> *m_replay_status_formatter =
nullptr;

View File

@@ -52,6 +52,7 @@ BootstrapRequest<I>::BootstrapRequest(
Journaler *journaler,
MirrorPeerClientMeta *client_meta,
Context *on_finish,
bool *do_resync,
rbd::mirror::ProgressContext *progress_ctx)
: BaseRequest("rbd::mirror::image_replayer::BootstrapRequest",
reinterpret_cast<CephContext*>(local_io_ctx.cct()), on_finish),
@@ -63,6 +64,7 @@ BootstrapRequest<I>::BootstrapRequest(
m_local_mirror_uuid(local_mirror_uuid),
m_remote_mirror_uuid(remote_mirror_uuid), m_journaler(journaler),
m_client_meta(client_meta), m_progress_ctx(progress_ctx),
m_do_resync(do_resync),
m_lock(unique_lock_name("BootstrapRequest::m_lock", this)) {
}
@@ -73,6 +75,8 @@ BootstrapRequest<I>::~BootstrapRequest() {
template <typename I>
void BootstrapRequest<I>::send() {
*m_do_resync = false;
get_local_image_id();
}
@@ -372,7 +376,33 @@ void BootstrapRequest<I>::handle_open_local_image(int r) {
m_ret_val = r;
close_remote_image();
return;
} if (m_client.state == cls::journal::CLIENT_STATE_DISCONNECTED) {
}
I *local_image_ctx = (*m_local_image_ctx);
{
RWLock::RLocker snap_locker(local_image_ctx->snap_lock);
if (local_image_ctx->journal == nullptr) {
derr << ": local image does not support journaling" << dendl;
m_ret_val = -EINVAL;
close_local_image();
return;
}
r = (*m_local_image_ctx)->journal->is_resync_requested(m_do_resync);
if (r < 0) {
derr << ": failed to check if a resync was requested" << dendl;
m_ret_val = r;
close_local_image();
return;
}
}
if (*m_do_resync) {
close_remote_image();
return;
}
if (m_client.state == cls::journal::CLIENT_STATE_DISCONNECTED) {
dout(10) << ": client flagged disconnected -- skipping bootstrap" << dendl;
// The caller is expected to detect disconnect initializing remote journal.
m_ret_val = 0;

View File

@@ -52,14 +52,15 @@ public:
Journaler *journaler,
MirrorPeerClientMeta *client_meta,
Context *on_finish,
bool *do_resync,
ProgressContext *progress_ctx = nullptr) {
return new BootstrapRequest(local_io_ctx, remote_io_ctx,
image_sync_throttler, local_image_ctx,
local_image_name, remote_image_id,
global_image_id, work_queue, timer, timer_lock,
local_mirror_uuid, remote_mirror_uuid,
journaler, client_meta, on_finish,
progress_ctx);
journaler, client_meta, on_finish, do_resync,
progress_ctx);
}
BootstrapRequest(librados::IoCtx &local_io_ctx,
@@ -73,7 +74,7 @@ public:
const std::string &local_mirror_uuid,
const std::string &remote_mirror_uuid, Journaler *journaler,
MirrorPeerClientMeta *client_meta, Context *on_finish,
ProgressContext *progress_ctx = nullptr);
bool *do_resync, ProgressContext *progress_ctx = nullptr);
~BootstrapRequest();
void send();
@@ -158,6 +159,7 @@ private:
Journaler *m_journaler;
MirrorPeerClientMeta *m_client_meta;
ProgressContext *m_progress_ctx;
bool *m_do_resync;
Mutex m_lock;
bool m_canceled = false;