From 881f61bfcca3f57bfd8ba405093ef2f6c77d9457 Mon Sep 17 00:00:00 2001 From: Thomas Schoebel-Theuer Date: Wed, 26 Jul 2017 10:00:51 +0200 Subject: [PATCH 1/5] main: fix abort of logfile fetching --- kernel/sy_old/mars_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sy_old/mars_main.c b/kernel/sy_old/mars_main.c index 15a9409b..b95bf577 100644 --- a/kernel/sy_old/mars_main.c +++ b/kernel/sy_old/mars_main.c @@ -3881,6 +3881,7 @@ done: if (fetch_brick && (fetch_brick->power.led_off || fetch_brick->power.force_off || + fetch_brick->copy_error || !global->global_power.button || !_check_allow(global, parent, "connect") || !_check_allow(global, parent, "attach") || From cf84aeba9fbedbaa7656b92a3b90aa0079207f58 Mon Sep 17 00:00:00 2001 From: Thomas Schoebel-Theuer Date: Wed, 26 Jul 2017 10:27:04 +0200 Subject: [PATCH 2/5] main: avoid fetch from failed peer for a while --- kernel/sy_old/mars_main.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/sy_old/mars_main.c b/kernel/sy_old/mars_main.c index b95bf577..6f0cbf5c 100644 --- a/kernel/sy_old/mars_main.c +++ b/kernel/sy_old/mars_main.c @@ -554,6 +554,7 @@ struct mars_rotate { struct if_brick *if_brick; const char *fetch_path; const char *fetch_peer; + const char *avoid_peer; const char *preferred_peer; const char *parent_path; const char *parent_rest; @@ -578,6 +579,7 @@ struct mars_rotate { int fetch_next_is_available; int relevant_serial; int replay_code; + int avoid_count; bool has_symlinks; bool res_shutdown; bool has_error; @@ -1855,14 +1857,16 @@ int check_logfile(const char *peer, struct mars_dent *remote_dent, struct mars_d } else if (!rot->fetch_serial && rot->allow_update && !rot->is_primary && !rot->old_is_primary && (!rot->preferred_peer || !strcmp(rot->preferred_peer, peer)) && + (!rot->avoid_peer || strcmp(peer, rot->avoid_peer) || rot->avoid_count-- <= 0) && (!rot->split_brain_serial || remote_dent->d_serial < rot->split_brain_serial) && - (dst_size < 
src_size || !local_dent)) { + (dst_size < src_size || !local_dent)) { // start copy brick instance status = _update_file(parent, switch_path, rot->fetch_path, remote_dent->d_path, peer, src_size); MARS_DBG("update '%s' from peer '%s' status = %d\n", remote_dent->d_path, peer, status); if (likely(status >= 0)) { rot->fetch_serial = remote_dent->d_serial; rot->fetch_next_is_available = 0; + brick_string_free(rot->avoid_peer); brick_string_free(rot->fetch_peer); rot->fetch_peer = brick_strdup(peer); } @@ -2752,6 +2756,7 @@ void rot_destruct(void *_rot) rot->log_say = NULL; brick_string_free(rot->fetch_path); brick_string_free(rot->fetch_peer); + brick_string_free(rot->avoid_peer); brick_string_free(rot->preferred_peer); brick_string_free(rot->parent_path); brick_string_free(rot->parent_rest); @@ -3894,6 +3899,10 @@ done: if (fetch_brick->inputs[i] && fetch_brick->inputs[i]->brick) fetch_brick->inputs[i]->brick->power.io_timeout = 1; } + if (fetch_brick->copy_error && !rot->avoid_peer && rot->fetch_peer) { + rot->avoid_peer = brick_strdup(rot->fetch_peer); + rot->avoid_count = 3; + } status = mars_kill_brick((void*)fetch_brick); if (status < 0) { MARS_ERR("could not kill fetch_brick, status = %d\n", status); From 3d9bbcb23c74b1777c02867940cbcaf03ab83108 Mon Sep 17 00:00:00 2001 From: Thomas Schoebel-Theuer Date: Thu, 20 Jul 2017 15:14:07 +0200 Subject: [PATCH 3/5] main: check alive link timestamp instead of tree version The tree version might be updated less frequently. 
--- kernel/sy_old/mars_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sy_old/mars_main.c b/kernel/sy_old/mars_main.c index 6f0cbf5c..628a2193 100644 --- a/kernel/sy_old/mars_main.c +++ b/kernel/sy_old/mars_main.c @@ -4314,7 +4314,7 @@ int _update_syncstatus(struct mars_rotate *rot, struct copy_brick *copy, char *p if (rot->sync_finish_stamp.tv_sec) { struct kstat peer_time_stat = {}; - peer_time_path = path_make("/mars/tree-%s", peer); + peer_time_path = path_make("/mars/alive-%s", peer); status = mars_stat(peer_time_path, &peer_time_stat, true); if (unlikely(status < 0)) { MARS_ERR("cannot stat '%s'\n", peer_time_path); From 45f944084289c4e4075505231be752b81cf988dc Mon Sep 17 00:00:00 2001 From: Thomas Schoebel-Theuer Date: Thu, 6 Jul 2017 08:49:24 +0200 Subject: [PATCH 4/5] main: protect rot list by rwsem --- kernel/sy_old/mars_main.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/sy_old/mars_main.c b/kernel/sy_old/mars_main.c index 628a2193..9a9de77f 100644 --- a/kernel/sy_old/mars_main.c +++ b/kernel/sy_old/mars_main.c @@ -603,6 +603,7 @@ struct mars_rotate { struct key_value_pair msgs[sizeof(rot_keys) / sizeof(char*)]; }; +static struct rw_semaphore rot_sem = __RWSEM_INITIALIZER(rot_sem); static LIST_HEAD(rot_anchor); /////////////////////////////////////////////////////////////////////// @@ -2750,7 +2751,9 @@ void rot_destruct(void *_rot) { struct mars_rotate *rot = _rot; if (likely(rot)) { + down_write(&rot_sem); list_del_init(&rot->rot_head); + up_write(&rot_sem); write_info_links(rot); del_channel(rot->log_say); rot->log_say = NULL; @@ -2819,8 +2822,11 @@ int make_log_init(void *buf, struct mars_dent *dent) rot->global = global; parent->d_private = rot; parent->d_private_destruct = rot_destruct; - list_add_tail(&rot->rot_head, &rot_anchor); assign_keys(rot->msgs, rot_keys); + + down_write(&rot_sem); + list_add_tail(&rot->rot_head, &rot_anchor); + up_write(&rot_sem); } rot->replay_link = 
NULL; From 34d7ebbd711b756c23968e65cc924cafbb9280a2 Mon Sep 17 00:00:00 2001 From: Thomas Schoebel-Theuer Date: Wed, 26 Jul 2017 11:15:58 +0200 Subject: [PATCH 5/5] all: release mars0.1stable43 --- ChangeLog | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ChangeLog b/ChangeLog index f4ac1e66..2c2f15be 100644 --- a/ChangeLog +++ b/ChangeLog @@ -135,6 +135,15 @@ Changelog for series 0.1b: ----------------------------------- Changelog for series 0.1: +mars0.1stable43 +-------- + * Major fix, only relevant for k >= 3 replicas: + Logfile fetch did not switch over to another alive peer + upon _specific_ network problems with the _current_ + peer. As a consequence, an unaffected replica could + hang. Workaround was possible by pause-fetch / + resume-fetch or by fixing the network :) + mars0.1stable42 -------- * Minor fix: ssh IPs and port numbers are automatically probed