MINOR: mworker: add support for case when new worker dies

The case where the new worker fails while parsing its configuration or while trying to apply it can be considered a new one, because the master process no longer needs to re-exec. The master simply keeps the previous worker (forked before the reload) and lets the new one exit with a failure. When the new worker exits, in the master process context (mworker_catch_sigchld) we need to stop the MASTER proxy listener and drop the server attached to the new worker's CLI sockpair (it is inherited in the master). Then we explicitly delete the master's end of this sockpair (child->ipc_fd[0]) from the fdtab and free the memory allocated for the worker process. on_new_child_failure() is called before the cleanup to signal to systemd that the reload/load has failed. If the new worker fails during the first start, so that there is no previous worker, the master process should exit immediately in order to keep the same behaviour as before this architecture change.
2025-02-21 05:06:56 +00:00 · 2024-10-09 15:44:43 +02:00 · 2024-10-09 15:44:43 +02:00 · c8aac63893
commit c8aac63893
parent 2bb07b913d
1 changed files with 34 additions and 1 deletions
--- a/src/mworker.c
+++ b/src/mworker.c
@ -341,6 +341,8 @@ void mworker_catch_sigchld(struct sig_handler *sh)
 	int exitpid = -1;
 	int status = 0;
 	int childfound;
+	struct listener *l, *l_next;
+	struct proxy *curproxy;

 restart_wait:

@ -372,6 +374,36 @@ restart_wait:
 		if (!childfound) {
 			/* We didn't find the PID in the list, that shouldn't happen but we can emit a warning */
 			ha_warning("Process %d exited with code %d (%s)\n", exitpid, status, (status >= 128) ? strsignal(status - 128) : "Exit");
+		} else if (child->options & PROC_O_INIT) {
+			on_new_child_failure();
+
+			/* Detach all listeners */
+			for (curproxy = proxies_list; curproxy; curproxy = curproxy->next) {
+				list_for_each_entry_safe(l, l_next, &curproxy->conf.listeners, by_fe) {
+					if ((l->rx.fd == child->ipc_fd[0]) || (l->rx.fd == child->ipc_fd[1])) {
+						unbind_listener(l);
+						delete_listener(l);
+					}
+				}
+			}
+
+			/* Drop server */
+			if (child->srv)
+				srv_drop(child->srv);
+
+			/* Delete fd from poller fdtab, which will close it */
+			fd_delete(child->ipc_fd[0]);
+			child->ipc_fd[0] = -1;
+			mworker_free_child(child);
+			child = NULL;
+
+			/* When the worker fails during the first startup, there
+			 * is no previous worker with state PROC_O_LEAVING, so the
+			 * master process should exit here as well to keep the
+			 * previous behaviour.
+			 */
+			if ((proc_self->options & PROC_O_TYPE_MASTER) && (proc_self->reloads == 0))
+				exit(status);
 		} else {
 			/* check if exited child is a current child */
 			if (!(child->options & PROC_O_LEAVING)) {
@ -390,7 +422,8 @@ restart_wait:
 						ha_warning("A worker process unexpectedly died and this can only be explained by a bug in haproxy or its dependencies.\nPlease check that you are running an up to date and maintained version of haproxy and open a bug report.\n");
 						display_version();
 					}
-					if (!(global.tune.options & GTUNE_NOEXIT_ONFAILURE)) {
+					/* the new worker, which has been launched at reload, has status PROC_O_INIT */
+					if (!(global.tune.options & GTUNE_NOEXIT_ONFAILURE) && !(child->options & PROC_O_INIT)) {
 						ha_alert("exit-on-failure: killing every processes with SIGTERM\n");
 						mworker_kill(SIGTERM);
 					}