mirror of
https://github.com/ceph/ceph
synced 2025-01-04 02:02:36 +00:00
Merge pull request #12212 from trociny/wip-16991
rbd-mirror: recovering after split-brain

Reviewed-by: Jason Dillaman <dillaman@redhat.com>
This commit is contained in:
commit
566d01bfd2
@ -357,4 +357,18 @@ test -z "$(get_mirror_position ${CLUSTER2} ${POOL} ${image})"
|
||||
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
|
||||
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+error' 'disconnected'
|
||||
|
||||
# Split-brain scenario: the image is promoted on each cluster in turn, with a
# write on ${CLUSTER1} in between, and the test checks that ${CLUSTER1} reports
# 'split-brain' and goes back to replaying after a resync request.
testlog "TEST: split-brain"
|
||||
image=split-brain
|
||||
# Create the image on ${CLUSTER2} and wait until ${CLUSTER1} reports it as
# replaying.
create_image ${CLUSTER2} ${POOL} ${image}
|
||||
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
|
||||
# Demote on ${CLUSTER2}; ${CLUSTER1} is expected to report 'up+stopped'.
demote_image ${CLUSTER2} ${POOL} ${image}
|
||||
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
|
||||
# Promote on ${CLUSTER1} and write to the image there.
# NOTE(review): the trailing 10 is presumably a write count -- confirm against
# write_image.
promote_image ${CLUSTER1} ${POOL} ${image}
|
||||
write_image ${CLUSTER1} ${POOL} ${image} 10
|
||||
# Swap roles back without waiting for any status in between: demote on
# ${CLUSTER1}, promote on ${CLUSTER2}.
demote_image ${CLUSTER1} ${POOL} ${image}
|
||||
promote_image ${CLUSTER2} ${POOL} ${image}
|
||||
# ${CLUSTER1} must now report an error whose description matches 'split-brain'.
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+error' 'split-brain'
|
||||
# Request a resync on ${CLUSTER1} and expect replay to resume.
# NOTE(review): 'image_id' looks like the name of an output variable -- confirm
# against request_resync_image.
request_resync_image ${CLUSTER1} ${POOL} ${image} image_id
|
||||
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
|
||||
|
||||
echo OK
|
||||
|
@ -483,8 +483,8 @@ test_status_in_pool_dir()
|
||||
local status_log=${TEMPDIR}/${cluster}-${image}.mirror_status
|
||||
rbd --cluster ${cluster} -p ${pool} mirror image status ${image} |
|
||||
tee ${status_log} >&2
|
||||
grep "state: .*${state_pattern}" ${status_log}
|
||||
grep "description: .*${description_pattern}" ${status_log}
|
||||
grep "state: .*${state_pattern}" ${status_log} || return 1
|
||||
grep "description: .*${description_pattern}" ${status_log} || return 1
|
||||
}
|
||||
|
||||
wait_for_status_in_pool_dir()
|
||||
|
@ -402,14 +402,14 @@ public:
|
||||
remote_mirror_uuid,
|
||||
&mock_journaler,
|
||||
&m_mirror_peer_client_meta,
|
||||
on_finish);
|
||||
on_finish, &m_do_resync);
|
||||
}
|
||||
|
||||
librbd::ImageCtx *m_remote_image_ctx;
|
||||
librbd::ImageCtx *m_local_image_ctx = nullptr;
|
||||
librbd::MockTestImageCtx *m_local_test_image_ctx = nullptr;
|
||||
librbd::journal::MirrorPeerClientMeta m_mirror_peer_client_meta;
|
||||
|
||||
bool m_do_resync;
|
||||
};
|
||||
|
||||
TEST_F(TestMockImageReplayerBootstrapRequest, NonPrimaryRemoteSyncingState) {
|
||||
|
@ -75,6 +75,7 @@ struct BootstrapRequest<librbd::MockTestImageCtx> {
|
||||
::journal::MockJournalerProxy *journaler,
|
||||
librbd::journal::MirrorPeerClientMeta *client_meta,
|
||||
Context *on_finish,
|
||||
bool *do_resync,
|
||||
rbd::mirror::ProgressContext *progress_ctx = nullptr) {
|
||||
assert(s_instance != nullptr);
|
||||
s_instance->on_finish = on_finish;
|
||||
|
@ -398,7 +398,7 @@ void ImageReplayer<I>::bootstrap() {
|
||||
&m_local_image_ctx, m_local_image_name, m_remote_image_id,
|
||||
m_global_image_id, m_threads->work_queue, m_threads->timer,
|
||||
&m_threads->timer_lock, m_local_mirror_uuid, m_remote_mirror_uuid,
|
||||
m_remote_journaler, &m_client_meta, ctx, &m_progress_cxt);
|
||||
m_remote_journaler, &m_client_meta, ctx, &m_do_resync, &m_progress_cxt);
|
||||
|
||||
{
|
||||
Mutex::Locker locker(m_lock);
|
||||
@ -429,6 +429,9 @@ void ImageReplayer<I>::handle_bootstrap(int r) {
|
||||
dout(5) << "remote image is non-primary or local image is primary" << dendl;
|
||||
on_start_fail(0, "remote image is non-primary or local image is primary");
|
||||
return;
|
||||
} else if (r == -EEXIST) {
|
||||
on_start_fail(r, "split-brain detected");
|
||||
return;
|
||||
} else if (r < 0) {
|
||||
on_start_fail(r, "error bootstrapping replay");
|
||||
return;
|
||||
@ -436,7 +439,6 @@ void ImageReplayer<I>::handle_bootstrap(int r) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
assert(m_local_journal == nullptr);
|
||||
{
|
||||
RWLock::RLocker snap_locker(m_local_image_ctx->snap_lock);
|
||||
@ -453,13 +455,8 @@ void ImageReplayer<I>::handle_bootstrap(int r) {
|
||||
|
||||
{
|
||||
Mutex::Locker locker(m_lock);
|
||||
bool do_resync = false;
|
||||
r = m_local_image_ctx->journal->is_resync_requested(&do_resync);
|
||||
if (r < 0) {
|
||||
derr << "failed to check if a resync was requested" << dendl;
|
||||
}
|
||||
|
||||
if (do_resync) {
|
||||
if (m_do_resync) {
|
||||
Context *on_finish = m_on_start_finish;
|
||||
m_stopping_for_resync = true;
|
||||
FunctionContext *ctx = new FunctionContext([this, on_finish](int r) {
|
||||
|
@ -253,6 +253,7 @@ private:
|
||||
int m_last_r = 0;
|
||||
std::string m_state_desc;
|
||||
BootstrapProgressContext m_progress_cxt;
|
||||
bool m_do_resync;
|
||||
image_replayer::EventPreprocessor<ImageCtxT> *m_event_preprocessor = nullptr;
|
||||
image_replayer::ReplayStatusFormatter<ImageCtxT> *m_replay_status_formatter =
|
||||
nullptr;
|
||||
|
@ -52,6 +52,7 @@ BootstrapRequest<I>::BootstrapRequest(
|
||||
Journaler *journaler,
|
||||
MirrorPeerClientMeta *client_meta,
|
||||
Context *on_finish,
|
||||
bool *do_resync,
|
||||
rbd::mirror::ProgressContext *progress_ctx)
|
||||
: BaseRequest("rbd::mirror::image_replayer::BootstrapRequest",
|
||||
reinterpret_cast<CephContext*>(local_io_ctx.cct()), on_finish),
|
||||
@ -63,6 +64,7 @@ BootstrapRequest<I>::BootstrapRequest(
|
||||
m_local_mirror_uuid(local_mirror_uuid),
|
||||
m_remote_mirror_uuid(remote_mirror_uuid), m_journaler(journaler),
|
||||
m_client_meta(client_meta), m_progress_ctx(progress_ctx),
|
||||
m_do_resync(do_resync),
|
||||
m_lock(unique_lock_name("BootstrapRequest::m_lock", this)) {
|
||||
}
|
||||
|
||||
@ -73,6 +75,8 @@ BootstrapRequest<I>::~BootstrapRequest() {
|
||||
|
||||
// Starts the request: clears the resync flag, then calls
// get_local_image_id().
template <typename I>
|
||||
void BootstrapRequest<I>::send() {
|
||||
// m_do_resync points at a bool supplied through the constructor. Write a
// defined 'false' through it before any other step runs; a later
// is_resync_requested() call may write to the same bool.
*m_do_resync = false;
|
||||
|
||||
get_local_image_id();
|
||||
}
|
||||
|
||||
@ -372,7 +376,33 @@ void BootstrapRequest<I>::handle_open_local_image(int r) {
|
||||
m_ret_val = r;
|
||||
close_remote_image();
|
||||
return;
|
||||
} if (m_client.state == cls::journal::CLIENT_STATE_DISCONNECTED) {
|
||||
}
|
||||
|
||||
I *local_image_ctx = (*m_local_image_ctx);
|
||||
{
|
||||
RWLock::RLocker snap_locker(local_image_ctx->snap_lock);
|
||||
if (local_image_ctx->journal == nullptr) {
|
||||
derr << ": local image does not support journaling" << dendl;
|
||||
m_ret_val = -EINVAL;
|
||||
close_local_image();
|
||||
return;
|
||||
}
|
||||
|
||||
r = (*m_local_image_ctx)->journal->is_resync_requested(m_do_resync);
|
||||
if (r < 0) {
|
||||
derr << ": failed to check if a resync was requested" << dendl;
|
||||
m_ret_val = r;
|
||||
close_local_image();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (*m_do_resync) {
|
||||
close_remote_image();
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_client.state == cls::journal::CLIENT_STATE_DISCONNECTED) {
|
||||
dout(10) << ": client flagged disconnected -- skipping bootstrap" << dendl;
|
||||
// The caller is expected to detect disconnect initializing remote journal.
|
||||
m_ret_val = 0;
|
||||
|
@ -52,14 +52,15 @@ public:
|
||||
Journaler *journaler,
|
||||
MirrorPeerClientMeta *client_meta,
|
||||
Context *on_finish,
|
||||
bool *do_resync,
|
||||
ProgressContext *progress_ctx = nullptr) {
|
||||
return new BootstrapRequest(local_io_ctx, remote_io_ctx,
|
||||
image_sync_throttler, local_image_ctx,
|
||||
local_image_name, remote_image_id,
|
||||
global_image_id, work_queue, timer, timer_lock,
|
||||
local_mirror_uuid, remote_mirror_uuid,
|
||||
journaler, client_meta, on_finish,
|
||||
progress_ctx);
|
||||
journaler, client_meta, on_finish, do_resync,
|
||||
progress_ctx);
|
||||
}
|
||||
|
||||
BootstrapRequest(librados::IoCtx &local_io_ctx,
|
||||
@ -73,7 +74,7 @@ public:
|
||||
const std::string &local_mirror_uuid,
|
||||
const std::string &remote_mirror_uuid, Journaler *journaler,
|
||||
MirrorPeerClientMeta *client_meta, Context *on_finish,
|
||||
ProgressContext *progress_ctx = nullptr);
|
||||
bool *do_resync, ProgressContext *progress_ctx = nullptr);
|
||||
~BootstrapRequest();
|
||||
|
||||
void send();
|
||||
@ -158,6 +159,7 @@ private:
|
||||
Journaler *m_journaler;
|
||||
MirrorPeerClientMeta *m_client_meta;
|
||||
ProgressContext *m_progress_ctx;
|
||||
bool *m_do_resync;
|
||||
Mutex m_lock;
|
||||
bool m_canceled = false;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user